# Library includes
################################################################################
ifeq ($(CONFIG_LIBINTEL_INTRINSICS),y)
+CINCLUDES-$(call have_gcc) += -I$(LIBINTEL_INTRINSICS_BASE)/include-gcc
+CXXINCLUDES-$(call have_gcc) += -I$(LIBINTEL_INTRINSICS_BASE)/include-gcc
+
CINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
CXXINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
endif
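
# A sketch of the intent (assuming Unikraft's have_gcc/have_clang helpers
# expand to "y" for the toolchain in use): each compiler resolves
# <immintrin.h> and friends only from the tree written for it, so GCC
# builds use the include-gcc copies while Clang builds use include-llvm.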
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <adxintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _ADXINTRIN_H_INCLUDED
+#define _ADXINTRIN_H_INCLUDED
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_subborrow_u32 (unsigned char __CF, unsigned int __X,
+ unsigned int __Y, unsigned int *__P)
+{
+ return __builtin_ia32_sbb_u32 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarry_u32 (unsigned char __CF, unsigned int __X,
+ unsigned int __Y, unsigned int *__P)
+{
+ return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarryx_u32 (unsigned char __CF, unsigned int __X,
+ unsigned int __Y, unsigned int *__P)
+{
+ return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_subborrow_u64 (unsigned char __CF, unsigned long long __X,
+ unsigned long long __Y, unsigned long long *__P)
+{
+ return __builtin_ia32_sbb_u64 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarry_u64 (unsigned char __CF, unsigned long long __X,
+ unsigned long long __Y, unsigned long long *__P)
+{
+ return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarryx_u64 (unsigned char __CF, unsigned long long __X,
+ unsigned long long __Y, unsigned long long *__P)
+{
+ return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P);
+}
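+
+/* A minimal multi-precision sketch (assumes 256-bit unsigned integers kept
+   as four little-endian 64-bit limbs a[], b[], sum[]):
+
+     unsigned char c = 0;
+     for (int i = 0; i < 4; i++)
+       c = _addcarry_u64 (c, a[i], b[i], &sum[i]);
+     // c is the final carry-out of the 256-bit addition
+*/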
+#endif
+
+#endif /* _ADXINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the AMD Programmer's
+   Manual Update, version 2.x.  */
+
+#ifndef _AMMINTRIN_H_INCLUDED
+#define _AMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE3, SSE2 and SSE header files.  */
+#include <pmmintrin.h>
+
+#ifndef __SSE4A__
+#pragma GCC push_options
+#pragma GCC target("sse4a")
+#define __DISABLE_SSE4A__
+#endif /* __SSE4A__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_sd (double * __P, __m128d __Y)
+{
+ __builtin_ia32_movntsd (__P, (__v2df) __Y);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ss (float * __P, __m128 __Y)
+{
+ __builtin_ia32_movntss (__P, (__v4sf) __Y);
+}
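+
+/* A minimal sketch of the non-temporal scalar stores (useful for large
+   write-once buffers; _mm_sfence comes from <xmmintrin.h>, pulled in via
+   <pmmintrin.h> above):
+
+     for (size_t i = 0; i < n; i++)
+       _mm_stream_sd (&dst[i], v);   // store low double, bypassing caches
+     _mm_sfence ();                  // make the NT stores globally visible
+*/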
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_si64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L)
+{
+ return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L);
+}
+#else
+#define _mm_extracti_si64(X, I, L) \
+ ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X), \
+ (unsigned int)(I), (unsigned int)(L)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_si64 (__m128i __X,__m128i __Y)
+{
+ return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L)
+{
+ return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L);
+}
+#else
+#define _mm_inserti_si64(X, Y, I, L) \
+ ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), \
+ (unsigned int)(I), (unsigned int)(L)))
+#endif
+
+#ifdef __DISABLE_SSE4A__
+#undef __DISABLE_SSE4A__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4A__ */
+
+#endif /* _AMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxbf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXBF16INTRIN_H_INCLUDED
+#define _AMXBF16INTRIN_H_INCLUDED
+
+#if !defined(__AMX_BF16__)
+#pragma GCC push_options
+#pragma GCC target("amx-bf16")
+#define __DISABLE_AMX_BF16__
+#endif /* __AMX_BF16__ */
+
+#if defined(__x86_64__)
+#define _tile_dpbf16ps_internal(dst,src1,src2) \
+ __asm__ volatile\
+ ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpbf16ps(dst,src1,src2) \
+ _tile_dpbf16ps_internal (dst, src1, src2)
+
+#endif
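+
+/* A minimal sketch (tile numbers are immediates; tiles must be configured
+   first via _tile_loadconfig from <amxtileintrin.h>, and on Linux the
+   kernel must grant AMX state, e.g. arch_prctl (ARCH_REQ_XCOMP_PERM,
+   XFEATURE_XTILEDATA)):
+
+     _tile_dpbf16ps (0, 1, 2);   // tmm0 += tmm1 . tmm2 (bf16 pairs -> fp32)
+*/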
+
+#ifdef __DISABLE_AMX_BF16__
+#undef __DISABLE_AMX_BF16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_BF16__ */
+
+#endif /* _AMXBF16INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXCOMPLEXINTRIN_H_INCLUDED
+#define _AMXCOMPLEXINTRIN_H_INCLUDED
+
+#if !defined(__AMX_COMPLEX__)
+#pragma GCC push_options
+#pragma GCC target("amx-complex")
+#define __DISABLE_AMX_COMPLEX__
+#endif /* __AMX_COMPLEX__ */
+
+#if defined(__x86_64__)
+#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \
+ __asm__ volatile\
+ ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmimfp16ps(src1_dst,src2,src3) \
+ _tile_cmmimfp16ps_internal (src1_dst, src2, src3)
+
+#define _tile_cmmrlfp16ps(src1_dst,src2,src3) \
+ _tile_cmmrlfp16ps_internal (src1_dst, src2, src3)
+
+#endif
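+
+/* A minimal sketch: each fp16 element pair is treated as (real, imag);
+   the RL form accumulates the real parts and the IM form the imaginary
+   parts of the complex products into an fp32 tile:
+
+     _tile_cmmrlfp16ps (0, 1, 2);   // tmm0 += Re (tmm1 * tmm2)
+     _tile_cmmimfp16ps (0, 1, 2);   // tmm0 += Im (tmm1 * tmm2)
+*/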
+
+#ifdef __DISABLE_AMX_COMPLEX__
+#undef __DISABLE_AMX_COMPLEX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_COMPLEX__ */
+
+#endif /* _AMXCOMPLEXINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxfp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXFP16INTRIN_H_INCLUDED
+#define _AMXFP16INTRIN_H_INCLUDED
+
+#if !defined(__AMX_FP16__)
+#pragma GCC push_options
+#pragma GCC target("amx-fp16")
+#define __DISABLE_AMX_FP16__
+#endif /* __AMX_FP16__ */
+
+#if defined(__x86_64__)
+#define _tile_dpfp16ps_internal(dst,src1,src2) \
+ __asm__ volatile \
+ ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpfp16ps(dst,src1,src2) \
+ _tile_dpfp16ps_internal (dst,src1,src2)
+
+#endif
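+
+/* A minimal sketch, analogous to the bf16 form in this tree:
+
+     _tile_dpfp16ps (0, 1, 2);   // tmm0 += tmm1 . tmm2 (fp16 pairs -> fp32)
+*/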
+
+#ifdef __DISABLE_AMX_FP16__
+#undef __DISABLE_AMX_FP16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_FP16__ */
+
+#endif /* _AMXFP16INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxint8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXINT8INTRIN_H_INCLUDED
+#define _AMXINT8INTRIN_H_INCLUDED
+
+#if !defined(__AMX_INT8__)
+#pragma GCC push_options
+#pragma GCC target("amx-int8")
+#define __DISABLE_AMX_INT8__
+#endif /* __AMX_INT8__ */
+
+#if defined(__x86_64__)
+#define _tile_int8_dp_internal(name,dst,src1,src2) \
+ __asm__ volatile \
+ ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpbssd(dst,src1,src2) \
+ _tile_int8_dp_internal (tdpbssd, dst, src1, src2)
+
+#define _tile_dpbsud(dst,src1,src2) \
+ _tile_int8_dp_internal (tdpbsud, dst, src1, src2)
+
+#define _tile_dpbusd(dst,src1,src2) \
+ _tile_int8_dp_internal (tdpbusd, dst, src1, src2)
+
+#define _tile_dpbuud(dst,src1,src2) \
+ _tile_int8_dp_internal (tdpbuud, dst, src1, src2)
+
+#endif
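+
+/* A minimal sketch: the four variants differ only in operand signedness
+   (first letter = src1, second = src2; s = signed, u = unsigned bytes).
+   For the common quantized case of unsigned activations times signed
+   weights:
+
+     _tile_dpbusd (0, 1, 2);   // tmm0 += u8 (tmm1) . s8 (tmm2), 4-byte groups
+*/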
+
+#ifdef __DISABLE_AMX_INT8__
+#undef __DISABLE_AMX_INT8__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_INT8__ */
+
+#endif /* _AMXINT8INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxtileintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXTILEINTRIN_H_INCLUDED
+#define _AMXTILEINTRIN_H_INCLUDED
+
+#if !defined(__AMX_TILE__)
+#pragma GCC push_options
+#pragma GCC target("amx-tile")
+#define __DISABLE_AMX_TILE__
+#endif /* __AMX_TILE__ */
+
+#if defined(__x86_64__)
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tile_loadconfig (const void *__config)
+{
+ __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tile_storeconfig (void *__config)
+{
+ __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tile_release (void)
+{
+ __asm__ volatile ("tilerelease" ::);
+}
+
+#define _tile_loadd(dst,base,stride) \
+ _tile_loadd_internal (dst, base, stride)
+
+#define _tile_loadd_internal(dst,base,stride) \
+ __asm__ volatile \
+ ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+
+#define _tile_stream_loadd(dst,base,stride) \
+ _tile_stream_loadd_internal (dst, base, stride)
+
+#define _tile_stream_loadd_internal(dst,base,stride) \
+ __asm__ volatile \
+ ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \
+ :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+
+#define _tile_stored(dst,base,stride) \
+ _tile_stored_internal (dst, base, stride)
+
+#define _tile_stored_internal(src,base,stride) \
+ __asm__ volatile \
+ ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \
+ :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \
+ : "memory")
+
+#define _tile_zero(dst) \
+ _tile_zero_internal (dst)
+
+#define _tile_zero_internal(dst) \
+ __asm__ volatile \
+ ("tilezero\t%%tmm"#dst ::)
+
+#endif
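+
+/* A minimal end-to-end sketch (hypothetical src/dst buffers; assumes the
+   OS has enabled AMX state and uses the documented 64-byte configuration
+   layout: byte 0 palette id, byte 1 start_row, bytes 16..47 per-tile
+   column widths in bytes, bytes 48..63 per-tile row counts):
+
+     struct __tile_cfg {
+       unsigned char palette_id, start_row, reserved[14];
+       unsigned short colsb[16];
+       unsigned char rows[16];
+     } cfg = { 0 };
+     cfg.palette_id = 1;
+     cfg.rows[0] = 16; cfg.colsb[0] = 64;   // tmm0: 16 rows x 64 bytes
+     _tile_loadconfig (&cfg);
+     _tile_loadd (0, src, 64);              // stride in bytes
+     _tile_stored (0, dst, 64);
+     _tile_release ();
+*/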
+
+#ifdef __DISABLE_AMX_TILE__
+#undef __DISABLE_AMX_TILE__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_TILE__ */
+
+#endif /* _AMXTILEINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX2INTRIN_H_INCLUDED
+#define _AVX2INTRIN_H_INCLUDED
+
+#ifndef __AVX2__
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#define __DISABLE_AVX2__
+#endif /* __AVX2__ */
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+ byte integers in the first 2 operands. Starting offsets within
+ operands are determined by the 3rd mask operand. */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
+{
+ return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
+ (__v32qi)__Y, __M);
+}
+#else
+#define _mm256_mpsadbw_epu8(X, Y, M) \
+ ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \
+ (__v32qi)(__m256i)(Y), (int)(M)))
+#endif
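+
+/* A minimal sketch (a per-lane operation, as in SSE4.1): with __M = 0,
+   16-bit result element i holds the SAD of the 4-byte window of X starting
+   at byte i against the first 4-byte group of Y, a common building block
+   for small pattern searches. */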
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi8 (__m256i __A)
+{
+ return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi16 (__m256i __A)
+{
+ return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi32 (__m256i __A)
+{
+ return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v32qu)__A + (__v32qu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v16hu)__A + (__v16hu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v8su)__A + (__v8su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A + (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
+{
+ return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
+ (__v4di)__B,
+ __N * 8);
+}
+#else
+/* Without optimization (__N * 8) ends up in a register, so the insn
+   pattern cannot match; use a macro instead.  */
+#define _mm256_alignr_epi8(A, B, N) \
+ ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), \
+ (int)(N) * 8))
+#endif
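+
+/* Note: as with most AVX2 byte-granular operations, the concatenation and
+   shift happen independently within each 128-bit lane, so this is not a
+   single 256-bit byte rotate; e.g. _mm256_alignr_epi8 (a, b, 4) shifts
+   (a:b) right by 4 bytes separately in the low and high lanes. */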
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_si256 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A & (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_si256 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
+{
+ return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
+ (__v32qi)__Y,
+ (__v32qi)__M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
+{
+ return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
+ (__v16hi)__Y,
+ __M);
+}
+#else
+#define _mm256_blend_epi16(X, Y, M) \
+ ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \
+ (__v16hi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v32qi)__A == (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v16hi)__A == (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v8si)__A == (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4di)__A == (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v32qs)__A > (__v32qs)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v16hi)__A > (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v8si)__A > (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4di)__A > (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
+ (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
+ (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
+ (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
+ (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
+ (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
+ (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_epi8 (__m256i __A)
+{
+ return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi16 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi32 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi64 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi32 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi64 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi64 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi16 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi32 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi64 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi32 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi64 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_epi64 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
+ (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v16hu)__A * (__v16hu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v8su)__A * (__v8su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epu32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_si256 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A | (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sad_epu8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
+ (__v32qi)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi32 (__m256i __A, const int __mask)
+{
+ return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
+{
+ return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
+{
+ return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
+}
+#else
+#define _mm256_shuffle_epi32(A, N) \
+ ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
+#define _mm256_shufflehi_epi16(A, N) \
+ ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#define _mm256_shufflelo_epi16(A, N) \
+ ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi8 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi16 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bslli_epi128 (__m256i __A, const int __N)
+{
+ return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_si256 (__m256i __A, const int __N)
+{
+ return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_bslli_epi128(A, N) \
+ ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
+#define _mm256_slli_si256(A, N) \
+ ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
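+
+/* Note: VPSLLDQ (and VPSRLDQ below) shift bytes within each 128-bit lane
+   independently, so _mm256_slli_si256 (x, 4) moves bytes by 4 within each
+   lane rather than across the whole 256-bit register. */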
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi16 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi16 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi32 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi32 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi64 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi16 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi16 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi32 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi32 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bsrli_epi128 (__m256i __A, const int __N)
+{
+ return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_si256 (__m256i __A, const int __N)
+{
+ return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_bsrli_epi128(A, N) \
+ ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
+#define _mm256_srli_si256(A, N) \
+ ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi16 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi16 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi32 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi32 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+ return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi64 (__m256i __A, __m128i __B)
+{
+ return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v32qu)__A - (__v32qu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v16hu)__A - (__v16hu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v8su)__A - (__v8su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A - (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_si256 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A ^ (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_load_si256 (__m256i const *__X)
+{
+ return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
+}
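+
+/* Note (per the MOVNTDQA definition): the non-temporal hint only takes
+   effect on write-combining memory such as a mapped framebuffer; on
+   ordinary write-back memory this behaves like a normal aligned load. */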
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastss_ps (__m128 __X)
+{
+ return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastss_ps (__m128 __X)
+{
+ return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsd_pd (__m128d __X)
+{
+ return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsi128_si256 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
+}
+
+#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
+#define _mm_broadcastsd_pd(X) _mm_movedup_pd(X)
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
+ (__v4si)__Y,
+ __M);
+}
+#else
+#define _mm_blend_epi32(X, Y, M) \
+ ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
+{
+ return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
+ (__v8si)__Y,
+ __M);
+}
+#else
+#define _mm256_blend_epi32(X, Y, M) \
+ ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastb_epi8 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastw_epi16 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastd_epi32 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastq_epi64 (__m128i __X)
+{
+ return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastb_epi8 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastw_epi16 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastd_epi32 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastq_epi64 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_pd (__m256d __X, const int __M)
+{
+ return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
+}
+#else
+#define _mm256_permute4x64_pd(X, M) \
+ ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
+#endif
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
+{
+ return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_epi64 (__m256i __X, const int __M)
+{
+ return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_permute4x64_epi64(X, M) \
+ ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
+{
+ return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
+}
+#else
+#define _mm256_permute2x128_si256(X, Y, M) \
+ ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti128_si256 (__m256i __X, const int __M)
+{
+ return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_extracti128_si256(X, M) \
+ ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
+{
+ return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
+}
+#else
+#define _mm256_inserti128_si256(X, Y, M) \
+ ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
+ (__v2di)(__m128i)(Y), \
+ (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi32 (int const *__X, __m256i __M )
+{
+ return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
+ (__v8si)__M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi64 (long long const *__X, __m256i __M )
+{
+ return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
+ (__v4di)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi32 (int const *__X, __m128i __M )
+{
+ return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
+ (__v4si)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi64 (long long const *__X, __m128i __M )
+{
+ return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
+ (__v2di)__M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
+{
+ __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
+{
+ __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
+{
+ __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
+{
+ __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
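+
+/* Note: for all the VPMASKMOV forms above, only elements whose mask sign
+   bit is set participate; masked-off load elements are zeroed, masked-off
+   store elements leave memory untouched, and, per the SDM, masked-off
+   elements do not cause memory faults. */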
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+ __v2df __zero = _mm_setzero_pd ();
+ __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
+
+ return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
+ __base,
+ (__v4si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
+ __m128d __mask, const int __scale)
+{
+ return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
+ __base,
+ (__v4si)__index,
+ (__v2df)__mask,
+ __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+ __v4df __zero = _mm256_setzero_pd ();
+ __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
+
+ return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
+ __base,
+ (__v4si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_pd (__m256d __src, double const *__base,
+ __m128i __index, __m256d __mask, const int __scale)
+{
+ return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
+ __base,
+ (__v4si)__index,
+ (__v4df)__mask,
+ __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+ __v2df __src = _mm_setzero_pd ();
+ __v2df __mask = _mm_cmpeq_pd (__src, __src);
+
+ return (__m128d) __builtin_ia32_gatherdiv2df (__src,
+ __base,
+ (__v2di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
+ __m128d __mask, const int __scale)
+{
+ return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
+ __base,
+ (__v2di)__index,
+ (__v2df)__mask,
+ __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
+{
+ __v4df __src = _mm256_setzero_pd ();
+ __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
+
+ return (__m256d) __builtin_ia32_gatherdiv4df (__src,
+ __base,
+ (__v4di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_pd (__m256d __src, double const *__base,
+ __m256i __index, __m256d __mask, const int __scale)
+{
+ return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
+ __base,
+ (__v4di)__index,
+ (__v4df)__mask,
+ __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
+{
+ __v4sf __src = _mm_setzero_ps ();
+ __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+
+ return (__m128) __builtin_ia32_gathersiv4sf (__src,
+ __base,
+ (__v4si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
+ __m128 __mask, const int __scale)
+{
+ return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
+ __base,
+ (__v4si)__index,
+ (__v4sf)__mask,
+ __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
+{
+ __v8sf __src = _mm256_setzero_ps ();
+ __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
+
+ return (__m256) __builtin_ia32_gathersiv8sf (__src,
+ __base,
+ (__v8si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_ps (__m256 __src, float const *__base,
+ __m256i __index, __m256 __mask, const int __scale)
+{
+ return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
+ __base,
+ (__v8si)__index,
+ (__v8sf)__mask,
+ __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
+{
+ __v4sf __src = _mm_setzero_ps ();
+ __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+
+ return (__m128) __builtin_ia32_gatherdiv4sf (__src,
+ __base,
+ (__v2di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
+ __m128 __mask, const int __scale)
+{
+ return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
+ __base,
+ (__v2di)__index,
+ (__v4sf)__mask,
+ __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
+{
+ __v4sf __src = _mm_setzero_ps ();
+ __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+
+ return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
+ __base,
+ (__v4di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_ps (__m128 __src, float const *__base,
+ __m256i __index, __m128 __mask, const int __scale)
+{
+ return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
+ __base,
+ (__v4di)__index,
+ (__v4sf)__mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi64 (long long int const *__base,
+ __m128i __index, const int __scale)
+{
+ __v2di __src = __extension__ (__v2di){ 0, 0 };
+ __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
+
+ return (__m128i) __builtin_ia32_gathersiv2di (__src,
+ __base,
+ (__v4si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
+ __m128i __index, __m128i __mask, const int __scale)
+{
+ return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
+ __base,
+ (__v4si)__index,
+ (__v2di)__mask,
+ __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi64 (long long int const *__base,
+ __m128i __index, const int __scale)
+{
+ __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
+ __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+ return (__m256i) __builtin_ia32_gathersiv4di (__src,
+ __base,
+ (__v4si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
+ __m128i __index, __m256i __mask,
+ const int __scale)
+{
+ return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
+ __base,
+ (__v4si)__index,
+ (__v4di)__mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi64 (long long int const *__base,
+ __m128i __index, const int __scale)
+{
+ __v2di __src = __extension__ (__v2di){ 0, 0 };
+ __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
+
+ return (__m128i) __builtin_ia32_gatherdiv2di (__src,
+ __base,
+ (__v2di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
+ __m128i __index, __m128i __mask, const int __scale)
+{
+ return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
+ __base,
+ (__v2di)__index,
+ (__v2di)__mask,
+ __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi64 (long long int const *__base,
+ __m256i __index, const int __scale)
+{
+ __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
+ __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+ return (__m256i) __builtin_ia32_gatherdiv4di (__src,
+ __base,
+ (__v4di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
+ __m256i __index, __m256i __mask,
+ const int __scale)
+{
+ return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
+ __base,
+ (__v4di)__index,
+ (__v4di)__mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
+{
+ __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+ __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+ return (__m128i) __builtin_ia32_gathersiv4si (__src,
+ __base,
+ (__v4si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
+ __m128i __mask, const int __scale)
+{
+ return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
+ __base,
+ (__v4si)__index,
+ (__v4si)__mask,
+ __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
+{
+ __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
+ __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
+
+ return (__m256i) __builtin_ia32_gathersiv8si (__src,
+ __base,
+ (__v8si)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
+ __m256i __index, __m256i __mask,
+ const int __scale)
+{
+ return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
+ __base,
+ (__v8si)__index,
+ (__v8si)__mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
+{
+ __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+ __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+ return (__m128i) __builtin_ia32_gatherdiv4si (__src,
+ __base,
+ (__v2di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
+ __m128i __mask, const int __scale)
+{
+ return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
+ __base,
+ (__v2di)__index,
+ (__v4si)__mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
+{
+ __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+ __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+ return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
+ __base,
+ (__v4di)__index,
+ __mask,
+ __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
+ __m256i __index, __m128i __mask,
+ const int __scale)
+{
+ return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
+ __base,
+ (__v4di)__index,
+ (__v4si)__mask,
+ __scale);
+}
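+
+/* Editor's illustrative sketch (assumes a hypothetical `int const *__tbl'):
+   a masked gather loads only the lanes whose mask element has its top
+   bit set; the remaining lanes keep the corresponding `src' value:
+
+     __m128i __idx = _mm_setr_epi32 (0, 2, 4, 6);
+     __m128i __msk = _mm_setr_epi32 (-1, 0, -1, 0); // load lanes 0 and 2
+     __m128i __src = _mm_set1_epi32 (99);
+     __m128i __r   = _mm_mask_i32gather_epi32 (__src, __tbl, __idx,
+                                               __msk, 4);
+*/
+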
+#else /* __OPTIMIZE__ */
+#define _mm_i32gather_pd(BASE, INDEX, SCALE) \
+ (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \
+ (double const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2df) \
+ _mm_cmpeq_pd (_mm_setzero_pd (),\
+ _mm_setzero_pd ()),\
+ (int) (SCALE))
+
+#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), \
+ (double const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2df)(__m128d) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \
+ (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \
+ (double const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4df) \
+ _mm256_cmp_pd (_mm256_setzero_pd (),\
+ _mm256_setzero_pd (),\
+ _CMP_EQ_OQ), \
+ (int) (SCALE))
+
+#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), \
+ (double const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4df)(__m256d) (MASK), \
+ (int) (SCALE))
+
+#define _mm_i64gather_pd(BASE, INDEX, SCALE) \
+ (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \
+ (double const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2df) \
+ _mm_cmpeq_pd (_mm_setzero_pd (),\
+ _mm_setzero_pd ()),\
+ (int) (SCALE))
+
+#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), \
+ (double const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2df)(__m128d) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \
+ (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \
+ (double const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4df) \
+ _mm256_cmp_pd (_mm256_setzero_pd (),\
+ _mm256_setzero_pd (),\
+ _CMP_EQ_OQ), \
+ (int) (SCALE))
+
+#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), \
+ (double const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4df)(__m256d) (MASK), \
+ (int) (SCALE))
+
+#define _mm_i32gather_ps(BASE, INDEX, SCALE) \
+ (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \
+ (float const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4sf) \
+ _mm_cmpeq_ps (_mm_setzero_ps (),\
+ _mm_setzero_ps ()),\
+ (int) (SCALE))
+
+#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), \
+ (float const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4sf)(__m128) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \
+ (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
+ (float const *) (BASE), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8sf) \
+ _mm256_cmp_ps (_mm256_setzero_ps (),\
+ _mm256_setzero_ps (),\
+ _CMP_EQ_OQ), \
+ (int) (SCALE))
+
+#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), \
+ (float const *) (BASE), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8sf)(__m256) (MASK), \
+ (int) (SCALE))
+
+#define _mm_i64gather_ps(BASE, INDEX, SCALE) \
+ (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (), \
+ (float const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4sf) \
+ _mm_cmpeq_ps (_mm_setzero_ps (),\
+ _mm_setzero_ps ()),\
+ (int) (SCALE))
+
+#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), \
+ (float const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4sf)(__m128) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \
+ (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
+ (float const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4sf) \
+ _mm_cmpeq_ps (_mm_setzero_ps (),\
+ _mm_setzero_ps ()),\
+ (int) (SCALE))
+
+#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), \
+ (float const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4sf)(__m128) (MASK), \
+ (int) (SCALE))
+
+#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \
+ (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
+ (long long const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2di)_mm_set1_epi64x (-1), \
+ (int) (SCALE))
+
+#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), \
+ (long long const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2di)(__m128i) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \
+ (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
+ (long long const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4di)_mm256_set1_epi64x (-1), \
+ (int) (SCALE))
+
+#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), \
+ (long long const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4di)(__m256i) (MASK), \
+ (int) (SCALE))
+
+#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \
+ (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
+ (long long const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2di)_mm_set1_epi64x (-1), \
+ (int) (SCALE))
+
+#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), \
+ (long long const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2di)(__m128i) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \
+ (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
+ (long long const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4di)_mm256_set1_epi64x (-1), \
+ (int) (SCALE))
+
+#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), \
+ (long long const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4di)(__m256i) (MASK), \
+ (int) (SCALE))
+
+#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \
+ (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \
+ (int const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4si)_mm_set1_epi32 (-1), \
+ (int) (SCALE))
+
+#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \
+ (int const *) (BASE), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4si)(__m128i) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \
+ (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
+ (int const *) (BASE), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8si)_mm256_set1_epi32 (-1), \
+ (int) (SCALE))
+
+#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), \
+ (int const *) (BASE), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8si)(__m256i) (MASK), \
+ (int) (SCALE))
+
+#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \
+ (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \
+ (int const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4si)_mm_set1_epi32 (-1), \
+ (int) (SCALE))
+
+#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \
+ (int const *) (BASE), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4si)(__m128i) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \
+ (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
+ (int const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4si)_mm_set1_epi32 (-1), \
+ (int) (SCALE))
+
+#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \
+ (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), \
+ (int const *) (BASE), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4si)(__m128i) (MASK), \
+ (int) (SCALE))
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_AVX2__
+#undef __DISABLE_AVX2__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX2__ */
+
+#endif /* _AVX2INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx5124fmapsintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX5124FMAPSINTRIN_H_INCLUDED
+#define _AVX5124FMAPSINTRIN_H_INCLUDED
+
+#ifndef __AVX5124FMAPS__
+#pragma GCC push_options
+#pragma GCC target("avx5124fmaps")
+#define __DISABLE_AVX5124FMAPS__
+#endif /* __AVX5124FMAPS__ */
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4fmadd_ps (__m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fmaddps ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4fmadd_ps (__mmask16 __U,
+ __m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_4fmadd_ss (__m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fmaddss ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_4fmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_4fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4fnmadd_ps (__m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fnmaddps ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4fnmadd_ps (__mmask16 __U,
+ __m512 __A, __m512 __B, __m512 __C,
+ __m512 __D, __m512 __E, __m128 *__F)
+{
+ return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
+ (__v16sf) __C,
+ (__v16sf) __D,
+ (__v16sf) __E,
+ (__v16sf) __A,
+ (const __v4sf *) __F,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_4fnmadd_ss (__m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fnmaddss ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_4fnmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_4fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
+ __m128 __D, __m128 __E, __m128 *__F)
+{
+ return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
+ (__v4sf) __C,
+ (__v4sf) __D,
+ (__v4sf) __E,
+ (__v4sf) __A,
+ (const __v4sf *) __F,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
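+/* Editor's note (semantics sketch, hedged): the 4FMAPS forms perform
+   four chained FMAs into one accumulator; each of __B..__E is
+   multiplied by the matching scalar broadcast from the __m128 that
+   __F points to, roughly:
+
+     __acc  = __A;
+     __acc += __B * (*__F)[0];
+     __acc += __C * (*__F)[1];
+     __acc += __D * (*__F)[2];
+     __acc += __E * (*__F)[3];
+*/
+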
+#ifdef __DISABLE_AVX5124FMAPS__
+#undef __DISABLE_AVX5124FMAPS__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX5124FMAPS__ */
+
+#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx5124vnniwintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX5124VNNIWINTRIN_H_INCLUDED
+#define _AVX5124VNNIWINTRIN_H_INCLUDED
+
+#ifndef __AVX5124VNNIW__
+#pragma GCC push_options
+#pragma GCC target("avx5124vnniw")
+#define __DISABLE_AVX5124VNNIW__
+#endif /* __AVX5124VNNIW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C,
+ __m512i __D, __m512i __E, __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssd ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssd_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+ (__v16si) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssd_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C,
+ __m512i __D, __m512i __E, __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssds ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssds_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+ (__v16si) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssds_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+ __m512i __C, __m512i __D, __m512i __E,
+ __m128i *__F)
+{
+ return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+ (__v16si) __C,
+ (__v16si) __D,
+ (__v16si) __E,
+ (__v16si) __A,
+ (const __v4si *) __F,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
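+/* Editor's note (semantics sketch, hedged): the 4VNNIW forms chain
+   four vpdpwssd steps into a single accumulator, roughly:
+
+     for (__i = 0; __i < 4; __i++)        // source vectors __B..__E
+       for (__l = 0; __l < 16; __l++)
+         __acc[__l] += lo16 (__src_i[__l]) * lo16 ((*__F)[__i])
+                     + hi16 (__src_i[__l]) * hi16 ((*__F)[__i]);
+
+   with __A as the initial accumulator; the `*s' forms saturate. */
+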
+#ifdef __DISABLE_AVX5124VNNIW__
+#undef __DISABLE_AVX5124VNNIW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX5124VNNIW__ */
+
+#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BF16INTRIN_H_INCLUDED
+#define _AVX512BF16INTRIN_H_INCLUDED
+
+#ifndef __AVX512BF16__
+#pragma GCC push_options
+#pragma GCC target("avx512bf16")
+#define __DISABLE_AVX512BF16__
+#endif /* __AVX512BF16__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* Convert One BF16 Data to One Single Float Data. */
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsbh_ss (__bf16 __A)
+{
+ return __builtin_ia32_cvtbf2sf (__A);
+}
+
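+/* Editor's illustrative note: __bf16 is the upper 16 bits of an IEEE
+   binary32, so widening to float merely re-attaches 16 zero mantissa
+   bits and is exact:
+
+     __bf16 __b = (__bf16) 1.0f;       // 0x3F80, exactly representable
+     float  __f = _mm_cvtsbh_ss (__b); // __f == 1.0f
+*/
+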
+/* vcvtne2ps2bf16 */
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
+{
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
+}
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
+{
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
+{
+ return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
+}
+
+/* vcvtneps2bf16 */
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtneps_pbh (__m512 __A)
+{
+ return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C)
+{
+ return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B)
+{
+ return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A);
+}
+
+/* vdpbf16ps */
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C)
+{
+ return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D)
+{
+ return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D)
+{
+ return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A);
+}
+
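+/* Editor's illustrative sketch (hypothetical __m512 inputs __a0, __a1,
+   __b0, __b1): vdpbf16ps accumulates bf16 pair dot products into the
+   fp32 lanes of the first operand:
+
+     __m512   __acc = _mm512_setzero_ps ();
+     __m512bh __x = _mm512_cvtne2ps_pbh (__a1, __a0);  // 32 bf16 values
+     __m512bh __y = _mm512_cvtne2ps_pbh (__b1, __b0);
+     __acc = _mm512_dpbf16_ps (__acc, __x, __y);
+*/
+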
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpbh_ps (__m256bh __A)
+{
+ return (__m512)_mm512_castsi512_ps ((__m512i)_mm512_slli_epi32 (
+ (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16));
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpbh_ps (__mmask16 __U, __m256bh __A)
+{
+ return (__m512)_mm512_castsi512_ps ((__m512i) _mm512_slli_epi32 (
+ (__m512i)_mm512_maskz_cvtepi16_epi32 (
+ (__mmask16)__U, (__m256i)__A), 16));
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A)
+{
+ return (__m512)_mm512_castsi512_ps ((__m512i)(_mm512_mask_slli_epi32 (
+ (__m512i)__S, (__mmask16)__U,
+ (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)));
+}
+
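+/* Editor's note: the widening helpers above rely on bf16 being the
+   high half of fp32 -- zero-extend each 16-bit element, shift left by
+   16, and reinterpret the lanes as float; no rounding occurs, so the
+   conversion is exact. */
+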
+#ifdef __DISABLE_AVX512BF16__
+#undef __DISABLE_AVX512BF16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BF16__ */
+
+#endif /* _AVX512BF16INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BF16VLINTRIN_H_INCLUDED
+#define _AVX512BF16VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
+#pragma GCC push_options
+#pragma GCC target("avx512bf16,avx512vl")
+#define __DISABLE_AVX512BF16VL__
+#endif /* __AVX512VL__ && __AVX512BF16__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+typedef __bf16 __bfloat16;
+
+#define _mm256_cvtneps_pbh(A) \
+ (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A)
+#define _mm_cvtneps_pbh(A) \
+ (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (A)
+
+/* vcvtne2ps2bf16 */
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
+{
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
+{
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
+{
+ return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
+{
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
+{
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
+{
+ return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
+}
+
+/* vcvtneps2bf16 */
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
+{
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
+{
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
+{
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
+{
+ return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
+}
+
+/* vdpbf16ps */
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
+{
+ return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
+{
+ return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
+{
+ return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
+{
+ return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
+{
+ return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
+{
+ return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
+}
+
+extern __inline __bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtness_sbh (float __A)
+{
+ __v4sf __V = {__A, 0, 0, 0};
+ __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+ (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
+ return __R[0];
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpbh_ps (__m128bh __A)
+{
+ return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
+ (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpbh_ps (__m128bh __A)
+{
+ return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
+ (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
+{
+ return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
+ (__m128i)_mm_maskz_cvtepi16_epi32 (
+ (__mmask8)__U, (__m128i)__A), 16));
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
+{
+ return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
+ (__m256i)_mm256_maskz_cvtepi16_epi32 (
+ (__mmask8)__U, (__m128i)__A), 16));
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
+{
+ return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
+ (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
+ (__m128i)__A), 16));
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
+{
+ return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
+ (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
+ (__m128i)__A), 16));
+}
+
+#ifdef __DISABLE_AVX512BF16VL__
+#undef __DISABLE_AVX512BF16VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BF16VL__ */
+
+#endif /* _AVX512BF16VLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512bitalgintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX512BITALGINTRIN_H_INCLUDED
+#define _AVX512BITALGINTRIN_H_INCLUDED
+
+#ifndef __AVX512BITALG__
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg")
+#define __DISABLE_AVX512BITALG__
+#endif /* __AVX512BITALG__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi8 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountb_v64qi ((__v64qi) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi16 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountw_v32hi ((__v32hi) __A);
+}
+
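+/* Editor's illustrative sketch: per-element population count, e.g.
+
+     __m512i __v = _mm512_set1_epi8 (0x0F);
+     __m512i __c = _mm512_popcnt_epi8 (__v); // every byte == 4
+*/
+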
+#ifdef __DISABLE_AVX512BITALG__
+#undef __DISABLE_AVX512BITALG__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALG__ */
+
+#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg,avx512bw")
+#define __DISABLE_AVX512BITALGBW__
+#endif /* __AVX512BITALG__ && __AVX512BW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi16 (__mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __mmask64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bitshuffle_epi64_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) __M);
+}
+
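+/* Editor's note (semantics sketch, hedged): vpshufbitqmb picks, for
+   each byte of the second operand, one bit from the corresponding
+   64-bit lane of the first (the byte's low 6 bits index the bit) and
+   packs the selected bits into a mask:
+
+     __m512i __bits = _mm512_set1_epi64 (0x02);  // bit 1 set per lane
+     __m512i __sel  = _mm512_set1_epi8 (1);      // select bit 1
+     __mmask64 __m  = _mm512_bitshuffle_epi64_mask (__bits, __sel);
+     // __m == ~0ULL
+*/
+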
+#ifdef __DISABLE_AVX512BITALGBW__
+#undef __DISABLE_AVX512BITALGBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALGBW__ */
+
+#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg,avx512vl,avx512bw")
+#define __DISABLE_AVX512BITALGVLBW__
+#endif /* __AVX512BITALG__ && __AVX512VL__ && __AVX512BW__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi8 (__mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bitshuffle_epi64_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) __M);
+}
+
+#ifdef __DISABLE_AVX512BITALGVLBW__
+#undef __DISABLE_AVX512BITALGVLBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALGVLBW__ */
+
+
+#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg,avx512vl")
+#define __DISABLE_AVX512BITALGVL__
+#endif /* __AVX512BITALG__ && __AVX512VL__ */
+
+extern __inline __mmask16
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_bitshuffle_epi64_mask (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi8 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountb_v32qi ((__v32qi) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi16 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountw_v16hi ((__v16hi) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountb_v16qi ((__v16qi) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountw_v8hi ((__v8hi) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi16 (__mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi8 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+#ifdef __DISABLE_AVX512BITALGVL__
+#undef __DISABLE_AVX512BITALGVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALGVL__ */
+
+#endif /* _AVX512BITALGINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BWINTRIN_H_INCLUDED
+#define _AVX512BWINTRIN_H_INCLUDED
+
+#ifndef __AVX512BW__
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#define __DISABLE_AVX512BW__
+#endif /* __AVX512BW__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi_u __attribute__ ((__vector_size__ (64), \
+ __may_alias__, __aligned__ (1)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef char __v64qi_u __attribute__ ((__vector_size__ (64), \
+ __may_alias__, __aligned__ (1)));
+
+typedef unsigned long long __mmask64;
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+ return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+ return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
+ return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
+ return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
+}
+
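+/* Editor's illustrative sketch: _kortest* ORs two masks and reports
+   both flags in one step:
+
+     unsigned char __all;  // set to 1 iff (__a | __b) is all ones
+     unsigned char __none = _kortest_mask32_u8 (__a, __b, &__all);
+     // __none == 1 iff (__a | __b) == 0
+*/
+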
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask32 (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask64 (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask32_u32 (__mmask32 __A)
+{
+ return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask64_u64 (__mmask64 __A)
+{
+ return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask32 (unsigned int __A)
+{
+ return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu64_mask64 (unsigned long long __A)
+{
+ return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A);
+}
+
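+/* Load and store whole mask registers through memory.  */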
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask32 (__mmask32 *__A)
+{
+ return (__mmask32) __builtin_ia32_kmovd (*__A);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask64 (__mmask64 *__A)
+{
+ return (__mmask64) __builtin_ia32_kmovq (*(__mmask64 *) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask32 (__mmask32 *__A, __mmask32 __B)
+{
+ *(__mmask32 *) __A = __builtin_ia32_kmovd (__B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask64 (__mmask64 *__A, __mmask64 __B)
+{
+ *(__mmask64 *) __A = __builtin_ia32_kmovq (__B);
+}
+
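+/* Bitwise mask logic: KNOT, KOR, KXNOR, KXOR, KAND and KANDN.  */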
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask32 (__mmask32 __A)
+{
+ return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask64 (__mmask64 __A)
+{
+ return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask32 (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask64 (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask32 (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask64 (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask32 (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask64 (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask32 (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask64 (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask32 (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask64 (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B);
+}
+
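+/* Masked register moves and unaligned loads/stores of 16-bit
+   elements (VMOVDQU16).  */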
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi16 (void const *__P)
+{
+ return (__m512i) (*(__v32hi_u *) __P);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi16 (void *__P, __m512i __A)
+{
+ *(__v32hi_u *) __P = (__v32hi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
+{
+ __builtin_ia32_storedquhi512_mask ((short *) __P,
+ (__v32hi) __A,
+ (__mmask32) __U);
+}
+
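+/* Masked register moves of 8-bit elements (VMOVDQU8).  */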
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
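+/* KUNPCKWD/KUNPCKDQ concatenate the low halves of two masks into a
+   mask twice as wide.  */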
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+ (__mmask32) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kunpackw_mask32 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+ (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
+{
+ return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+ (__mmask64) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kunpackd_mask64 (__mmask32 __A, __mmask32 __B)
+{
+ return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+ (__mmask64) __B);
+}
+
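+/* Unaligned loads/stores of 8-bit elements (VMOVDQU8).  */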
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi8 (void const *__P)
+{
+ return (__m512i) (*(__v64qi_u *) __P);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi8 (void *__P, __m512i __A)
+{
+ *(__v64qi_u *) __P = (__v64qi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
+{
+ __builtin_ia32_storedquqi512_mask ((char *) __P,
+ (__v64qi) __A,
+ (__mmask64) __U);
+}
+
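+/* VPSADBW: sums of absolute differences of unsigned bytes, one
+   64-bit result per 8-byte group.  */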
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sad_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
+ (__v64qi) __B);
+}
+
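+/* Narrow 16-bit elements to 8 bits: VPMOVWB truncates, VPMOVSWB
+   saturates as signed, VPMOVUSWB saturates as unsigned.  */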
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi8 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_undefined_si256 (),
+ (__mmask32) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+ (__v32qi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi16_epi8 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_undefined_si256 (),
+ (__mmask32) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+ (__v32qi) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi16_epi8 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+ (__v32qi) _mm256_undefined_si256 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+ (__v32qi) __O,
+ __M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+ __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
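+/* Broadcast the lowest byte/word of a vector, or (for the set1
+   forms) a scalar from a general register.  */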
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastb_epi8 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+ (__v64qi) _mm512_undefined_epi32 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+ (__v64qi) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+ (__v64qi) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
+{
+ return (__m512i)
+ __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastw_epi16 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+ (__v32hi) _mm512_undefined_epi32 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+ (__v32hi) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+ (__v32hi) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
+{
+ return (__m512i)
+ __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
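+/* 16-bit multiplies: VPMULHRSW returns the high bits after rounding
+   and scaling, VPMULHW/VPMULHUW the signed/unsigned high halves,
+   VPMULLW the low halves.  */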
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v32hu) __A * (__v32hu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
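+/* Widen 8-bit elements to 16 bits: VPMOVSXBW sign-extends,
+   VPMOVZXBW zero-extends.  */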
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi16 (__m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi16 (__m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
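+/* Word permutes: VPERMW indexes a single table, VPERMT2W/VPERMI2W
+   select from the concatenation of two tables.  */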
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+ (__v32hi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+ (__v32hi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+ (__v32hi) __A,
+ (__v32hi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I
+ /* idx */ ,
+ (__v32hi) __A,
+ (__v32hi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
+ __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I
+ /* idx */ ,
+ (__v32hi) __A,
+ (__v32hi) __B,
+ (__mmask32)
+ __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
+ __mmask32 __U, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+ (__v32hi) __I
+ /* idx */ ,
+ (__v32hi) __B,
+ (__mmask32)
+ __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
+ __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
+ /* idx */ ,
+ (__v32hi) __A,
+ (__v32hi) __B,
+ (__mmask32)
+ __U);
+}
+
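+/* Rounded unsigned averages (VPAVGB/VPAVGW) and modular byte
+   add/sub.  */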
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v64qu) __A + (__v64qu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v64qu) __A - (__v64qu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
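+/* Saturating byte arithmetic: VPSUBSB/VPADDSB clamp to the signed
+   range, VPSUBUSB/VPADDUSB to the unsigned range.  */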
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
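+/* The same modular and saturating arithmetic on 16-bit elements.  */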
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v32hu) __A - (__v32hu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v32hu) __A + (__v32hu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
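+/* Word shifts by the scalar count held in the low 64 bits of __B.  */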
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi16 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packs_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi16 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
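+/* VPMADDUBSW multiplies unsigned bytes by signed bytes and adds
+   adjacent pairs with signed saturation; VPMADDWD multiplies words
+   and adds adjacent pairs into 32-bit results.  */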
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maddubs_epi16 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+ (__v64qi) __Y,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+ (__v64qi) __Y,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+ (__v64qi) __Y,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_madd_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
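+/* Interleave the high/low bytes and words within each 128-bit
+   lane.  */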
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
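+/* Equality and greater-than compares into mask registers.  The
+   unsigned forms use VPCMPUB/VPCMPUW with predicate 0 (EQ) or
+   6 (GT).  */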
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu8_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+ (__v64qi) __B, 0,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi8_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+ (__v64qi) __B, 0,
+ __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu16_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+ (__v32hi) __B, 0,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi16_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+ (__v32hi) __B, 0,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu8_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+ (__v64qi) __B, 6,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi8_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+ (__v64qi) __B, 6,
+ __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu16_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+ (__v32hi) __B, 6,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi16_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+ (__v32hi) __B, 6,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ __U);
+}
+
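+/* VPMOVB2M/VPMOVW2M collect the element sign bits into a mask;
+   VPMOVM2B/VPMOVM2W expand each mask bit to an all-ones or all-zeros
+   element.  */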
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi8_mask (__m512i __A)
+{
+ return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi16_mask (__m512i __A)
+{
+ return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi8 (__mmask64 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi16 (__mmask32 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
+}
+
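+/* VPTESTMB/VPTESTMW set a mask bit where (__A & __B) is non-zero;
+   the VPTESTNM forms set it where the AND is zero.  */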
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi8_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+ (__v64qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi16_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+ (__v32hi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+ (__v32hi) __B, __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+ (__v64qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+ (__v32hi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+ (__v32hi) __B, __U);
+}
+
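+/* VPSHUFB: byte shuffle within each 128-bit lane; a control byte
+   with its top bit set zeroes the destination byte.  */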
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
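+/* Element-wise min/max for signed and unsigned bytes and words.  */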
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __M);
+}
+
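+/* Arithmetic word shifts by a scalar count, then per-element
+   variable shifts (VPSRAVW, VPSRLVW, VPSLLVW).  */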
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi16 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+ (__v8hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
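+
+/* Usage sketch (editorial note): _mm512_sra_epi16 shifts every word lane by
+   the single count held in the low quadword of __B, while the srav/srlv/sllv
+   forms shift each lane by the count in the corresponding lane of __B:
+
+     __m512i v = _mm512_set1_epi16 (-32);
+     __m512i c = _mm512_set1_epi16 (4);
+     __m512i r = _mm512_srav_epi16 (v, c);   // every lane: -32 >> 4 == -2
+*/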
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+ (__v32hi) __B,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __M);
+}
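+
+/* Usage sketch (editorial note): the pack operations narrow words to bytes
+   with saturation, taking eight words of __A and eight of __B within each
+   128-bit lane:
+
+     __m512i w = _mm512_set1_epi16 (300);
+     __m512i s = _mm512_packs_epi16 (w, w);   // signed saturation: all bytes 127
+     __m512i u = _mm512_packus_epi16 (w, w);  // unsigned saturation: all bytes 255
+*/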
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi8 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi16 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 4,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 1,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 5,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 2,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 4,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 1,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 5,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 2,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 4,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 1,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 5,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 2,
+ (__mmask64) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 4,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 1,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 5,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 2,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 4,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 1,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 5,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 2,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 4,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 1,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 5,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 2,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 4,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 1,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 5,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi8_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, 2,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 4,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 1,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 5,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi16_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, 2,
+ (__mmask32) -1);
+}
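+
+/* Editorial note: the third argument of the cmp builtins selects the
+   predicate (0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = GE, 6 = GT,
+   7 = TRUE), which is why the cmpneq/cmplt/cmpge/cmple wrappers above pass
+   4, 1, 5 and 2 respectively.  For example:
+
+     __m512i x = _mm512_set1_epi8 (1), y = _mm512_set1_epi8 (2);
+     __mmask64 lt = _mm512_cmplt_epi8_mask (x, y);   // all 64 bits set
+     __mmask64 ge = _mm512_cmpge_epi8_mask (x, y);   // 0
+*/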
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v32hi) __W,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packus_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v32hi) __W,
+ __M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask32 (__mmask32 __A, unsigned int __B)
+{
+ return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask64 (__mmask64 __A, unsigned int __B)
+{
+ return (__mmask64) __builtin_ia32_kshiftlidi ((__mmask64) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask32 (__mmask32 __A, unsigned int __B)
+{
+ return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask64 (__mmask64 __A, unsigned int __B)
+{
+ return (__mmask64) __builtin_ia32_kshiftridi ((__mmask64) __A,
+ (__mmask8) __B);
+}
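+
+/* Usage sketch (editorial note): the kshift intrinsics shift whole mask
+   registers; the count must be a compile-time constant (see the macro
+   fallbacks below for -O0 builds):
+
+     __mmask32 m = _cvtu32_mask32 (0x1);
+     __mmask32 s = _kshiftli_mask32 (m, 4);   // 0x10
+*/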
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi8 (__m512i __A, __m512i __B, const int __N)
+{
+ return (__m512i) __builtin_ia32_palignr512 ((__v8di) __A,
+ (__v8di) __B, __N * 8);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+ __m512i __B, const int __N)
+{
+ return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A,
+ (__v8di) __B,
+ __N * 8,
+ (__v8di) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi8 (__mmask64 __U, __m512i __A, __m512i __B,
+ const int __N)
+{
+ return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A,
+ (__v8di) __B,
+ __N * 8,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask64) __U);
+}
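+
+/* Editorial note: __N counts bytes; the builtin takes a bit count, hence
+   __N * 8.  As with the narrower palignr forms, the shift runs independently
+   within each 128-bit lane:
+
+     __m512i a = _mm512_set1_epi8 (1), b = _mm512_set1_epi8 (2);
+     __m512i r = _mm512_alignr_epi8 (a, b, 3);
+     // per lane: bytes 3..15 of b, then bytes 0..2 of a
+*/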
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dbsad_epu8 (__m512i __A, __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dbsad_epu8 (__m512i __W, __mmask32 __U, __m512i __A,
+ __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ __imm,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
+ (__v64qi) __B,
+ __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
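+
+/* Editorial note, in brief: dbsad first shuffles the dwords of __B within
+   each 128-bit lane under the __imm control (as _mm512_shuffle_epi32 would),
+   then accumulates sums of absolute differences of byte quadruplets between
+   __A and that shuffled copy into word lanes. */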
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi16 (__m512i __A, const int __imm)
+{
+ return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm)
+{
+ return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi16 (__m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ const int __B)
+{
+ return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shufflehi_epi16 (__m512i __A, const int __imm)
+{
+ return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
+ __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shufflehi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
+ __imm,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shufflehi_epi16 (__mmask32 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
+ __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shufflelo_epi16 (__m512i __A, const int __imm)
+{
+ return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
+ __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shufflelo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
+ __imm,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shufflelo_epi16 (__mmask32 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
+ __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi16 (__m512i __A, const int __imm)
+{
+ return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const int __imm)
+{
+ return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A,
+ (__v32hi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A,
+ (__v64qi) __W,
+ (__mmask64) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi16_mask (__mmask32 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, __P,
+ (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi16_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, __P,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, __P,
+ (__mmask64) __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi8_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, __P,
+ (__mmask64) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu16_mask (__mmask32 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, __P,
+ (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu16_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+ (__v32hi) __Y, __P,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, __P,
+ (__mmask64) __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu8_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+ (__v64qi) __Y, __P,
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bslli_epi128 (__m512i __A, const int __N)
+{
+ return (__m512i) __builtin_ia32_pslldq512 (__A, __N * 8);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bsrli_epi128 (__m512i __A, const int __N)
+{
+ return (__m512i) __builtin_ia32_psrldq512 (__A, __N * 8);
+}
+
+#else
+#define _kshiftli_mask32(X, Y) \
+ ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y)))
+
+#define _kshiftli_mask64(X, Y) \
+ ((__mmask64) __builtin_ia32_kshiftlidi ((__mmask64)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask32(X, Y) \
+ ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask64(X, Y) \
+ ((__mmask64) __builtin_ia32_kshiftridi ((__mmask64)(X), (__mmask8)(Y)))
+
+#define _mm512_alignr_epi8(X, Y, N) \
+ ((__m512i) __builtin_ia32_palignr512 ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), \
+ (int)((N) * 8)))
+
+#define _mm512_mask_alignr_epi8(W, U, X, Y, N) \
+ ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)((N) * 8), \
+ (__v8di)(__m512i)(W), (__mmask64)(U)))
+
+#define _mm512_maskz_alignr_epi8(U, X, Y, N) \
+ ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)((N) * 8), \
+ (__v8di)(__m512i) \
+ _mm512_setzero_si512 (), \
+ (__mmask64)(U)))
+
+#define _mm512_dbsad_epu8(X, Y, C) \
+ ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), \
+ (__v64qi)(__m512i) (Y), (int) (C), \
+ (__v32hi)(__m512i) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)-1))
+
+#define _mm512_mask_dbsad_epu8(W, U, X, Y, C) \
+ ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), \
+ (__v64qi)(__m512i) (Y), (int) (C), \
+ (__v32hi)(__m512i)(W), \
+ (__mmask32)(U)))
+
+#define _mm512_maskz_dbsad_epu8(U, X, Y, C) \
+ ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), \
+ (__v64qi)(__m512i) (Y), (int) (C), \
+ (__v32hi)(__m512i) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)(U)))
+
+#define _mm512_srli_epi16(A, B) \
+ ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \
+ (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
+
+#define _mm512_mask_srli_epi16(W, U, A, B) \
+ ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \
+ (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
+
+#define _mm512_maskz_srli_epi16(U, A, B) \
+ ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \
+ (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
+
+#define _mm512_slli_epi16(X, C) \
+ ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
+ (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask32)-1))
+
+#define _mm512_mask_slli_epi16(W, U, X, C) \
+ ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
+ (__v32hi)(__m512i)(W),\
+ (__mmask32)(U)))
+
+#define _mm512_maskz_slli_epi16(U, X, C) \
+ ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
+ (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask32)(U)))
+
+#define _mm512_shufflehi_epi16(A, B) \
+ ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), \
+ (__v32hi)(__m512i) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)-1))
+
+#define _mm512_mask_shufflehi_epi16(W, U, A, B) \
+ ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), \
+ (__v32hi)(__m512i)(W), \
+ (__mmask32)(U)))
+
+#define _mm512_maskz_shufflehi_epi16(U, A, B) \
+ ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), \
+ (__v32hi)(__m512i) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)(U)))
+
+#define _mm512_shufflelo_epi16(A, B) \
+ ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), \
+ (__v32hi)(__m512i) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)-1))
+
+#define _mm512_mask_shufflelo_epi16(W, U, A, B) \
+ ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), \
+ (__v32hi)(__m512i)(W), \
+ (__mmask32)(U)))
+
+#define _mm512_maskz_shufflelo_epi16(U, A, B) \
+ ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), \
+ (__v32hi)(__m512i) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)(U)))
+
+#define _mm512_srai_epi16(A, B) \
+ ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \
+ (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
+
+#define _mm512_mask_srai_epi16(W, U, A, B) \
+ ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \
+ (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
+
+#define _mm512_maskz_srai_epi16(U, A, B) \
+ ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \
+ (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
+
+#define _mm512_mask_blend_epi16(__U, __A, __W) \
+ ((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A), \
+ (__v32hi) (__W), \
+ (__mmask32) (__U)))
+
+#define _mm512_mask_blend_epi8(__U, __A, __W) \
+ ((__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) (__A), \
+ (__v64qi) (__W), \
+ (__mmask64) (__U)))
+
+#define _mm512_cmp_epi16_mask(X, Y, P) \
+ ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), \
+ (__v32hi)(__m512i)(Y), (int)(P),\
+ (__mmask32)(-1)))
+
+#define _mm512_cmp_epi8_mask(X, Y, P) \
+ ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), \
+ (__v64qi)(__m512i)(Y), (int)(P),\
+ (__mmask64)(-1)))
+
+#define _mm512_cmp_epu16_mask(X, Y, P) \
+ ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), \
+ (__v32hi)(__m512i)(Y), (int)(P),\
+ (__mmask32)(-1)))
+
+#define _mm512_cmp_epu8_mask(X, Y, P) \
+ ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), \
+ (__v64qi)(__m512i)(Y), (int)(P),\
+ (__mmask64)(-1)))
+
+#define _mm512_mask_cmp_epi16_mask(M, X, Y, P) \
+ ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), \
+ (__v32hi)(__m512i)(Y), (int)(P),\
+ (__mmask32)(M)))
+
+#define _mm512_mask_cmp_epi8_mask(M, X, Y, P) \
+ ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), \
+ (__v64qi)(__m512i)(Y), (int)(P),\
+ (__mmask64)(M)))
+
+#define _mm512_mask_cmp_epu16_mask(M, X, Y, P) \
+ ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), \
+ (__v32hi)(__m512i)(Y), (int)(P),\
+ (__mmask32)(M)))
+
+#define _mm512_mask_cmp_epu8_mask(M, X, Y, P) \
+ ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), \
+ (__v64qi)(__m512i)(Y), (int)(P),\
+ (__mmask64)(M)))
+
+#define _mm512_bslli_epi128(A, N) \
+ ((__m512i)__builtin_ia32_pslldq512 ((__m512i)(A), (int)(N) * 8))
+
+#define _mm512_bsrli_epi128(A, N) \
+ ((__m512i)__builtin_ia32_psrldq512 ((__m512i)(A), (int)(N) * 8))
+
+#endif
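+
+/* Editorial note: the macros above mirror the inline functions defined under
+   #ifdef __OPTIMIZE__.  Without optimization GCC does not fold the const int
+   arguments of always_inline functions down to constants, and these builtins
+   require immediate operands, so at -O0 the intrinsics must be macros that
+   pass the literal straight through to the builtin. */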
+
+#ifdef __DISABLE_AVX512BW__
+#undef __DISABLE_AVX512BW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BW__ */
+
+#endif /* _AVX512BWINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512CDINTRIN_H_INCLUDED
+#define _AVX512CDINTRIN_H_INCLUDED
+
+#ifndef __AVX512CD__
+#pragma GCC push_options
+#pragma GCC target("avx512cd")
+#define __DISABLE_AVX512CD__
+#endif /* __AVX512CD__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conflict_epi32 (__m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conflict_epi64 (__m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+ (__v8di) _mm512_setzero_si512 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+ (__v8di) _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
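+
+/* Usage sketch (editorial note): each result lane of the conflict
+   operations holds one bit per lower-indexed lane with an equal value, the
+   primitive behind vectorizing scatters with possibly duplicate indices:
+
+     __m512i idx = _mm512_set1_epi32 (9);
+     __m512i c = _mm512_conflict_epi32 (idx);   // lane i == (1 << i) - 1
+*/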
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+ (__v8di) _mm512_setzero_si512 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+ (__v8di) _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i)
+ __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
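+
+/* Usage sketch (editorial note): lzcnt counts leading zero bits per lane; a
+   zero lane yields the full element width:
+
+     __m512i n = _mm512_lzcnt_epi32 (_mm512_set1_epi32 (1));    // all lanes 31
+     __m512i z = _mm512_lzcnt_epi32 (_mm512_setzero_si512 ());  // all lanes 32
+*/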
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+ return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+ return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+}
+
+#ifdef __DISABLE_AVX512CD__
+#undef __DISABLE_AVX512CD__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512CD__ */
+
+#endif /* _AVX512CDINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512DQINTRIN_H_INCLUDED
+#define _AVX512DQINTRIN_H_INCLUDED
+
+#ifndef __AVX512DQ__
+#pragma GCC push_options
+#pragma GCC target("avx512dq")
+#define __DISABLE_AVX512DQ__
+#endif /* __AVX512DQ__ */
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_ktestcqi (__A, __B);
+ return (unsigned char) __builtin_ia32_ktestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestcqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_ktestchi (__A, __B);
+ return (unsigned char) __builtin_ia32_ktestzhi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestzhi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_ktestchi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_kortestcqi (__A, __B);
+ return (unsigned char) __builtin_ia32_kortestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestcqi (__A, __B);
+}
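+
+/* Editorial note: the _ktest* helpers expose both KTEST flags: the return
+   value is the ZF result (set when __A & __B is all zeroes) and *__CF
+   receives the CF result (set when ~__A & __B is all zeroes).  The _kortest*
+   helpers do the same for KORTEST, whose ZF tests __A | __B against all
+   zeroes and whose CF tests it against all ones. */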
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kaddqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask16 (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kaddhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask8_u32 (__mmask8 __A)
+{
+ return (unsigned int) __builtin_ia32_kmovb ((__mmask8) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask8 (unsigned int __A)
+{
+ return (__mmask8) __builtin_ia32_kmovb ((__mmask8) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask8 (__mmask8 *__A)
+{
+ return (__mmask8) __builtin_ia32_kmovb (*(__mmask8 *) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask8 (__mmask8 *__A, __mmask8 __B)
+{
+ *(__mmask8 *) __A = __builtin_ia32_kmovb (__B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask8 (__mmask8 __A)
+{
+ return (__mmask8) __builtin_ia32_knotqi ((__mmask8) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_korqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kxnorqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kxorqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kandqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask8 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f64x2 (__m128d __A)
+{
+ return (__m512d)
+ __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
+ __A,
+ (__v8df)
+ __O, __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
+ __A,
+ (__v8df)
+ _mm512_setzero_ps (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i64x2 (__m128i __A)
+{
+ return (__m512i)
+ __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
+ __A,
+ (__v8di)
+ __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
+ __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+ return (__m512)
+ __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+ (__v16sf)_mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+ (__v16sf)
+ __O, __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+ return (__m512i)
+ __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si)
+ __A,
+ (__v16si)
+ __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si)
+ __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x8 (__m256 __A)
+{
+ return (__m512)
+ __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+ (__v16sf)__O,
+ __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x8 (__m256i __A)
+{
+ return (__m512i)
+ __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si)
+ __A,
+ (__v16si)__O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si)
+ __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
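+
+/* Editorial note: the unmasked form is written as a plain vector multiply,
+   which GCC can emit as vpmullq under AVX512DQ; only the masked forms need
+   the explicit builtin so the merge/zero semantics are preserved. */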
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B)
+{
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
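+/* Editor's note -- illustrative only.  _mm512_andnot_pd computes
+   (~__A) & __B, following the long-standing SSE/AVX andnot convention,
+   so a lane-wise absolute value can be written as:
+
+     __m512d vabs = _mm512_andnot_pd (_mm512_set1_pd (-0.0), v);
+*/
+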
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi32_mask (__m512i __A)
+{
+ return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi64_mask (__m512i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi32 (__mmask16 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi64 (__mmask8 __A)
+{
+ return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+
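+/* Editor's note -- illustrative only.  _mm512_movepi32_mask collects
+   the most significant bit of each 32-bit lane into a mask register;
+   _mm512_movm_epi32 is the inverse, expanding each mask bit into an
+   all-ones or all-zeros lane:
+
+     __m512i v = _mm512_set1_epi32 (-1);
+     __mmask16 k = _mm512_movepi32_mask (v);   -- k == 0xffff
+     __m512i w = _mm512_movm_epi32 (k);        -- every lane == -1
+*/
+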
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epi64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epu64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epi64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epu64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
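+/* Editor's note -- illustrative only.  The cvtt* forms truncate toward
+   zero, matching C cast semantics; the ps variants take __m256 because
+   eight floats widen to eight 64-bit lanes.  Sketch:
+
+     __m512i q = _mm512_cvttpd_epi64 (_mm512_set1_pd (-2.7));
+     -- every lane of q is -2
+*/
+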
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epi64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epu64 (__m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epi64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epu64 (__m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
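+/* Editor's note -- illustrative only.  Unlike the cvtt* forms above,
+   these convert under the current MXCSR rounding mode
+   (_MM_FROUND_CUR_DIRECTION), so with the default round-to-nearest-even
+   -2.7 becomes -3 rather than -2:
+
+     __m512i q = _mm512_cvtpd_epi64 (_mm512_set1_pd (-2.7));
+*/
+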
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_ps (__m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_ps (__m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
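+/* Editor's note -- illustrative only.  Converting eight 64-bit integer
+   lanes to float narrows 512 bits to 256, hence the __m256 return
+   type; the epu64 forms treat the input as unsigned.  Sketch:
+
+     __m256 f = _mm512_cvtepi64_ps (q);
+*/
+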
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_pd (__m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_pd (__m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
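+/* Editor's note: the intrinsics below take immediate operands.  As
+   elsewhere in GCC's x86 headers, they are defined as inline functions
+   only under __OPTIMIZE__ (inlining is what guarantees the builtins
+   see compile-time constants); the #else branch further down provides
+   equivalent macros for unoptimized builds.  */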
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask8 (__mmask8 __A, unsigned int __B)
+{
+ return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask8 (__mmask8 __A, unsigned int __B)
+{
+ return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B);
+}
+
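+/* Editor's note -- illustrative only: _kshiftli_mask8 (0x01, 3) yields
+   0x08; bits shifted out of the 8-bit mask are lost.  */
+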
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_pd (__m512d __A, __m512d __B, int __C)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_pd (__m512d __W, __mmask8 __U,
+ __m512d __A, __m512d __B, int __C)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_pd (__mmask8 __U, __m512d __A, __m512d __B, int __C)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_ps (__m512 __A, __m512 __B, int __C)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_ps (__m512 __W, __mmask16 __U,
+ __m512 __A, __m512 __B, int __C)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_ps (__mmask16 __U, __m512 __A, __m512 __B, int __C)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
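+/* Editor's note -- illustrative only.  The range immediate selects the
+   operation in bits 1:0 (0 min, 1 max, 2 min of magnitudes, 3 max of
+   magnitudes) and the sign of the result in bits 3:2; see the Intel
+   intrinsics guide for the exact encoding.  One sketch, computing the
+   lane-wise magnitude maximum with the sign cleared (imm 0x0b):
+
+     __m512 m = _mm512_range_ps (a, b, 0x0b);
+*/
+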
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_sd (__m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ss (__m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ __U, __R);
+}
+
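+/* Editor's note -- illustrative only.  VREDUCE returns the "reduced
+   argument" src - round(src * 2^M) / 2^M, where M comes from the high
+   nibble of the immediate and the rounding mode from its low bits; an
+   immediate of 0x03 (M = 0, truncate) therefore extracts the signed
+   fractional part:
+
+     __m128d f = _mm_reduce_sd (a, b, 0x03);  -- low lane: frac of b[0]
+*/
+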
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_sd (__m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_ss (__m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ int __C, const int __R)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ int __C, const int __R)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ss_mask (__m128 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sd_mask (__m128d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_ss_mask (__mmask8 __U, __m128 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm, __U);
+}
+
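+/* Editor's note -- illustrative only.  The fpclass immediate is a
+   bitmask of categories to test (QNaN, +0, -0, +inf, -inf, denormal,
+   finite negative, SNaN -- one bit each, low to high); a result bit is
+   set when the value falls in any selected category.  Testing the low
+   lane for either kind of NaN:
+
+     __mmask8 k = _mm_fpclass_sd_mask (x, 0x81);  -- QNaN | SNaN
+*/
+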
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epi64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epi64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epu64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epi64 (__m256 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epu64 (__m256 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epu64 (__mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epi64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epi64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epu64 (__m512d __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epi64 (__m256 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epi64 (__mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epu64 (__m256 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epu64 (__mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi64_ps (__m512i __A, const int __R)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi64_ps (__m256 __W, __mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi64_ps (__mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu64_ps (__m512i __A, const int __R)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu64_ps (__m256 __W, __mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu64_ps (__mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi64_pd (__m512i __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi64_pd (__m512d __W, __mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi64_pd (__mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu64_pd (__m512i __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu64_pd (__m512d __W, __mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu64_pd (__mmask8 __U, __m512i __A,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
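+/* Editor's note -- illustrative only.  The _round variants take the
+   rounding mode as an explicit immediate instead of reading MXCSR; a
+   typical call combines a direction with exception suppression:
+
+     __m512i q = _mm512_cvt_roundpd_epi64
+       (d, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+*/
+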
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_pd (__m512d __A, int __B)
+{
+ return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_round_pd (__m512d __A, int __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A,
+ __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B)
+{
+ return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ int __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A,
+ __B,
+ (__v8df) __W,
+ __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B)
+{
+ return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_round_pd (__mmask8 __U, __m512d __A, int __B,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A,
+ __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_ps (__m512 __A, int __B)
+{
+ return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_round_ps (__m512 __A, int __B, const int __R)
+{
+ return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A,
+ __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B)
+{
+ return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A,
+ __B,
+ (__v16sf) __W,
+ __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B)
+{
+ return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_round_ps (__mmask16 __U, __m512 __A, int __B,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A,
+ __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __U, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf32x8_ps (__m512 __A, const int __imm)
+{
+ return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf32x8_ps (__m256 __W, __mmask8 __U, __m512 __A,
+ const int __imm)
+{
+ return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A,
+ __imm,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf32x8_ps (__mmask8 __U, __m512 __A,
+ const int __imm)
+{
+ return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf64x2_pd (__m512d __A, const int __imm)
+{
+ return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A,
+ __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m512d __A,
+ const int __imm)
+{
+ return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A,
+ __imm,
+ (__v2df) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf64x2_pd (__mmask8 __U, __m512d __A,
+ const int __imm)
+{
+ return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A,
+ __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti32x8_epi32 (__m512i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A,
+ __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti32x8_epi32 (__m256i __W, __mmask8 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A,
+ __imm,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti32x8_epi32 (__mmask8 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A,
+ __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti64x2_epi64 (__m512i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A,
+ __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A,
+ __imm,
+ (__v2di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti64x2_epi64 (__mmask8 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A,
+ __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
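+/* Editor's note -- illustrative only.  The extract intrinsics pull an
+   aligned 256- or 128-bit slice out of a 512-bit register; the
+   immediate is the slice index (0..1 for x8 slices, 0..3 for x2
+   slices).  Sketch:
+
+     __m128d top = _mm512_extractf64x2_pd (v, 3);  -- lanes 6..7
+*/
+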
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_round_pd (__m512d __A, __m512d __B, int __C,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_round_pd (__m512d __W, __mmask8 __U,
+ __m512d __A, __m512d __B, int __C,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df) __W,
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ int __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+ (__v8df) __B, __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_round_ps (__m512 __A, __m512 __B, int __C, const int __R)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1,
+ __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_round_ps (__m512 __W, __mmask16 __U,
+ __m512 __A, __m512 __B, int __C,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ int __C, const int __R)
+{
+ return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+ (__v16sf) __B, __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti32x8 (__m512i __A, __m256i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A,
+ (__v8si) __B,
+ __imm,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti32x8 (__m512i __W, __mmask16 __U, __m512i __A,
+ __m256i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A,
+ (__v8si) __B,
+ __imm,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti32x8 (__mmask16 __U, __m512i __A, __m256i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A,
+ (__v8si) __B,
+ __imm,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf32x8 (__m512 __A, __m256 __B, const int __imm)
+{
+ return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A,
+ (__v8sf) __B,
+ __imm,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf32x8 (__m512 __W, __mmask16 __U, __m512 __A,
+ __m256 __B, const int __imm)
+{
+ return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A,
+ (__v8sf) __B,
+ __imm,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf32x8 (__mmask16 __U, __m512 __A, __m256 __B,
+ const int __imm)
+{
+ return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A,
+ (__v8sf) __B,
+ __imm,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti64x2 (__m512i __A, __m128i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A,
+ (__v2di) __B,
+ __imm,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti64x2 (__m512i __W, __mmask8 __U, __m512i __A,
+ __m128i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A,
+ (__v2di) __B,
+ __imm,
+ (__v8di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti64x2 (__mmask8 __U, __m512i __A, __m128i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A,
+ (__v2di) __B,
+ __imm,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf64x2 (__m512d __A, __m128d __B, const int __imm)
+{
+ return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A,
+ (__v2df) __B,
+ __imm,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf64x2 (__m512d __W, __mmask8 __U, __m512d __A,
+ __m128d __B, const int __imm)
+{
+ return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A,
+ (__v2df) __B,
+ __imm,
+ (__v8df) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf64x2 (__mmask8 __U, __m512d __A, __m128d __B,
+ const int __imm)
+{
+ return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A,
+ (__v2df) __B,
+ __imm,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
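+/* Editor's note -- illustrative only: the insert intrinsics are the
+   inverse of the extracts above, overwriting the selected slice:
+
+     __m512i r = _mm512_inserti64x2 (v, pair, 0);  -- replaces lanes 0..1
+*/
+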
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_pd_mask (__mmask8 __U, __m512d __A,
+ const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_pd_mask (__m512d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ps_mask (__mmask16 __U, __m512 __A,
+ const int __imm)
+{
+ return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ps_mask (__m512 __A, const int __imm)
+{
+ return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A,
+ __imm,
+ (__mmask16) -1);
+}
+
+#else
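+/* Editor's note: macro fallbacks for builds without __OPTIMIZE__, so
+   the immediate arguments remain compile-time constants.  */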
+#define _kshiftli_mask8(X, Y) \
+ ((__mmask8) __builtin_ia32_kshiftliqi ((__mmask8)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask8(X, Y) \
+ ((__mmask8) __builtin_ia32_kshiftriqi ((__mmask8)(X), (__mmask8)(Y)))
+
+#define _mm_range_sd(A, B, C) \
+ ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_range_sd(W, U, A, B, C) \
+ ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_range_sd(U, A, B, C) \
+ ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_range_ss(A, B, C) \
+ ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_range_ss(W, U, A, B, C) \
+ ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_range_ss(U, A, B, C) \
+ ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_range_round_sd(A, B, C, R) \
+ ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_range_round_sd(W, U, A, B, C, R) \
+ ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (R)))
+
+#define _mm_maskz_range_round_sd(U, A, B, C, R) \
+ ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8)(U), (R)))
+
+#define _mm_range_round_ss(A, B, C, R) \
+ ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_range_round_ss(W, U, A, B, C, R) \
+ ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (R)))
+
+#define _mm_maskz_range_round_ss(U, A, B, C, R) \
+ ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8)(U), (R)))
+
+#define _mm512_cvtt_roundpd_epi64(A, B) \
+ ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di) \
+ _mm512_setzero_si512 (), \
+ -1, (B)))
+
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvtt_roundpd_epu64(A, B) \
+ ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvtt_roundps_epi64(A, B) \
+ ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvtt_roundps_epu64(A, B) \
+ ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundpd_epi64(A, B) \
+ ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundpd_epu64(A, B) \
+ ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundps_epi64(A, B) \
+ ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundps_epi64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundps_epu64(A, B) \
+ ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundps_epu64(U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundepi64_ps(A, B) \
+ ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, B) \
+ ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, B) \
+ ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B)))
+
+#define _mm512_cvt_roundepu64_ps(A, B) \
+ ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, B) \
+ ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, B) \
+ ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B)))
+
+#define _mm512_cvt_roundepi64_pd(A, B) \
+ ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, B) \
+ ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, B) \
+ ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B)))
+
+#define _mm512_cvt_roundepu64_pd(A, B) \
+ ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, B) \
+ ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, B) \
+ ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B)))
+
+#define _mm512_reduce_pd(A, B) \
+ ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \
+ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1))
+
+#define _mm512_reduce_round_pd(A, B, R) \
+ ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\
+ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R)))
+
+#define _mm512_mask_reduce_pd(W, U, A, B) \
+ ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \
+ (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U)))
+
+#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \
+ ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\
+ (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U), (R)))
+
+#define _mm512_maskz_reduce_pd(U, A, B) \
+ ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \
+ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U)))
+
+#define _mm512_maskz_reduce_round_pd(U, A, B, R) \
+ ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\
+ (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U), (R)))
+
+#define _mm512_reduce_ps(A, B) \
+ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \
+ (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1))
+
+#define _mm512_reduce_round_ps(A, B, R) \
+ ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\
+ (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R)))
+
+#define _mm512_mask_reduce_ps(W, U, A, B) \
+ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \
+ (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U)))
+
+#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \
+ ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\
+ (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U), (R)))
+
+#define _mm512_maskz_reduce_ps(U, A, B) \
+ ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \
+ (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U)))
+
+#define _mm512_maskz_reduce_round_ps(U, A, B, R) \
+ ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\
+ (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R)))
+
+#define _mm512_extractf32x8_ps(X, C) \
+ ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \
+ (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8)-1))
+
+#define _mm512_mask_extractf32x8_ps(W, U, X, C) \
+ ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \
+ (int) (C), (__v8sf)(__m256) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extractf32x8_ps(U, X, C) \
+ ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \
+ (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8) (U)))
+
+#define _mm512_extractf64x2_pd(X, C) \
+ ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\
+ (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8)-1))
+
+#define _mm512_mask_extractf64x2_pd(W, U, X, C) \
+ ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\
+ (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extractf64x2_pd(U, X, C) \
+ ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\
+ (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8) (U)))
+
+#define _mm512_extracti32x8_epi32(X, C) \
+ ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), \
+ (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)-1))
+
+#define _mm512_mask_extracti32x8_epi32(W, U, X, C) \
+ ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), \
+ (int) (C), (__v8si)(__m256i) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extracti32x8_epi32(U, X, C) \
+ ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), \
+ (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8) (U)))
+
+#define _mm512_extracti64x2_epi64(X, C) \
+ ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\
+ (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm512_mask_extracti64x2_epi64(W, U, X, C) \
+ ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\
+ (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extracti64x2_epi64(U, X, C) \
+ ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\
+ (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
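+/* Usage sketch (illustrative): the immediate picks which aligned
+   sub-vector is extracted; index 3 below selects the highest 128-bit
+   lane of the 512-bit source.
+
+     __m128i
+     top_lane (__m512i __v)
+     {
+       return _mm512_extracti64x2_epi64 (__v, 3);
+     }
+*/
+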
+#define _mm512_range_pd(A, B, C) \
+ ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd (), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_range_pd(W, U, A, B, C) \
+ ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_range_pd(U, A, B, C) \
+ ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_range_ps(A, B, C) \
+ ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_range_ps(W, U, A, B, C) \
+ ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_range_ps(U, A, B, C) \
+ ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_range_round_pd(A, B, C, R) \
+ ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R)))
+
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \
+ ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), (R)))
+
+#define _mm512_maskz_range_round_pd(U, A, B, C, R) \
+ ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(B), (int)(C), \
+ (__v8df)_mm512_setzero_pd (), (__mmask8)(U), (R)))
+
+#define _mm512_range_round_ps(A, B, C, R) \
+ ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R)))
+
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \
+ ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)(__m512)(W), (__mmask16)(U), (R)))
+
+#define _mm512_maskz_range_round_ps(U, A, B, C, R) \
+ ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(B), (int)(C), \
+ (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R)))
+
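+/* Usage sketch (illustrative): for the range operations, bits [1:0]
+   of the immediate select min, max, absolute min or absolute max,
+   and bits [3:2] select the sign control; 0x01 below requests max
+   with the sign taken from the first operand.
+
+     __m512d
+     range_max (__m512d __a, __m512d __b)
+     {
+       return _mm512_range_pd (__a, __b, 0x01);
+     }
+*/
+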
+#define _mm512_insertf64x2(X, Y, C) \
+ ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\
+ (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (X), \
+ (__mmask8)-1))
+
+#define _mm512_mask_insertf64x2(W, U, X, Y, C) \
+ ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\
+ (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (W), \
+ (__mmask8) (U)))
+
+#define _mm512_maskz_insertf64x2(U, X, Y, C) \
+ ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\
+ (__v2df)(__m128d) (Y), (int) (C), \
+ (__v8df)(__m512d) _mm512_setzero_pd (), (__mmask8) (U)))
+
+#define _mm512_inserti64x2(X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\
+ (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (X), (__mmask8)-1))
+
+#define _mm512_mask_inserti64x2(W, U, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\
+ (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (W), \
+ (__mmask8) (U)))
+
+#define _mm512_maskz_inserti64x2(U, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\
+ (__v2di)(__m128i) (Y), (int) (C), \
+ (__v8di)(__m512i) _mm512_setzero_si512 (), (__mmask8) (U)))
+
+#define _mm512_insertf32x8(X, Y, C) \
+ ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), \
+ (__v8sf)(__m256) (Y), (int) (C),\
+ (__v16sf)(__m512)_mm512_setzero_ps (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_insertf32x8(W, U, X, Y, C) \
+ ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), \
+ (__v8sf)(__m256) (Y), (int) (C),\
+ (__v16sf)(__m512)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_insertf32x8(U, X, Y, C) \
+ ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), \
+ (__v8sf)(__m256) (Y), (int) (C),\
+ (__v16sf)(__m512)_mm512_setzero_ps (),\
+ (__mmask16)(U)))
+
+#define _mm512_inserti32x8(X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), \
+ (__v8si)(__m256i) (Y), (int) (C),\
+ (__v16si)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_inserti32x8(W, U, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), \
+ (__v8si)(__m256i) (Y), (int) (C),\
+ (__v16si)(__m512i)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_inserti32x8(U, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), \
+ (__v8si)(__m256i) (Y), (int) (C),\
+ (__v16si)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask16)(U)))
+
+#define _mm_fpclass_ss_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), \
+ (int) (C), (__mmask8) (-1)))
+
+#define _mm_fpclass_sd_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \
+ (int) (C), (__mmask8) (-1)))
+
+#define _mm_mask_fpclass_ss_mask(X, C, U) \
+ ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), \
+ (int) (C), (__mmask8) (U)))
+
+#define _mm_mask_fpclass_sd_mask(X, C, U) \
+ ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \
+ (int) (C), (__mmask8) (U)))
+
+#define _mm512_mask_fpclass_pd_mask(u, X, C) \
+ ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), \
+ (int) (C), (__mmask8)(u)))
+
+#define _mm512_mask_fpclass_ps_mask(u, x, c) \
+ ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\
+ (int) (c),(__mmask16)(u)))
+
+#define _mm512_fpclass_pd_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), \
+ (int) (C), (__mmask8)-1))
+
+#define _mm512_fpclass_ps_mask(x, c) \
+ ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\
+ (int) (c),(__mmask16)-1))
+
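+/* Usage sketch (illustrative): the fpclass immediate is a bit mask
+   of categories to test (0x01 QNaN, 0x02 positive zero, 0x04
+   negative zero, 0x08 positive infinity, 0x10 negative infinity,
+   0x20 denormal, 0x40 finite negative, 0x80 SNaN), e.g. flagging
+   lanes holding a NaN of either kind:
+
+     __mmask8
+     nan_lanes (__m512d __v)
+     {
+       return _mm512_fpclass_pd_mask (__v, 0x01 | 0x80);
+     }
+*/
+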
+#define _mm_reduce_sd(A, B, C) \
+ ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8)-1))
+
+#define _mm_mask_reduce_sd(W, U, A, B, C) \
+ ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_sd(U, A, B, C) \
+ ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8)(U)))
+
+#define _mm_reduce_round_sd(A, B, C, R) \
+ ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \
+ ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \
+ ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_reduce_ss(A, B, C) \
+ ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8)-1))
+
+#define _mm_mask_reduce_ss(W, U, A, B, C) \
+ ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_ss(U, A, B, C) \
+ ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm_reduce_round_ss(A, B, C, R) \
+ ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \
+ ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \
+ (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \
+ ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \
+ (__mmask8)(U), (int)(R)))
+
+
+#endif
+
+#ifdef __DISABLE_AVX512DQ__
+#undef __DISABLE_AVX512DQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512DQ__ */
+
+#endif /* _AVX512DQINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512ERINTRIN_H_INCLUDED
+#define _AVX512ERINTRIN_H_INCLUDED
+
+#ifndef __AVX512ER__
+#pragma GCC push_options
+#pragma GCC target("avx512er")
+#define __DISABLE_AVX512ER__
+#endif /* __AVX512ER__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef double __v8df __attribute__ ((__vector_size__ (64)));
+typedef float __v16sf __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_exp2a23_round_pd (__m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+ (__v8df) _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+ (__v8df) _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_exp2a23_round_ps (__m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+ (__v16sf) _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp28_round_pd (__m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+ (__v8df) _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+ (__v8df) _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp28_round_ps (__m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+ (__v16sf) _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+ return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __B,
+ (__v2df) __A,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp28_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, int __R)
+{
+ return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R)
+{
+ return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ __U,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+ return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __B,
+ (__v4sf) __A,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp28_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, int __R)
+{
+ return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R)
+{
+ return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ __U,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt28_round_pd (__m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+ (__v8df) _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+ return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+ (__v8df) _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt28_round_ps (__m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+ (__v16sf) _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+ return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+ return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __B,
+ (__v2df) __A,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt28_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, int __R)
+{
+ return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R)
+{
+ return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ __U,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+ return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __B,
+ (__v4sf) __A,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt28_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, int __R)
+{
+ return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R)
+{
+ return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ __U,
+ __R);
+}
+
+#else
+#define _mm512_exp2a23_round_pd(A, C) \
+ __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \
+ __builtin_ia32_exp2pd_mask(A, W, U, C)
+
+#define _mm512_maskz_exp2a23_round_pd(U, A, C) \
+ __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_exp2a23_round_ps(A, C) \
+ __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \
+ __builtin_ia32_exp2ps_mask(A, W, U, C)
+
+#define _mm512_maskz_exp2a23_round_ps(U, A, C) \
+ __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_rcp28_round_pd(A, C) \
+ __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_rcp28_round_pd(W, U, A, C) \
+ __builtin_ia32_rcp28pd_mask(A, W, U, C)
+
+#define _mm512_maskz_rcp28_round_pd(U, A, C) \
+ __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_rcp28_round_ps(A, C) \
+ __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_rcp28_round_ps(W, U, A, C) \
+ __builtin_ia32_rcp28ps_mask(A, W, U, C)
+
+#define _mm512_maskz_rcp28_round_ps(U, A, C) \
+ __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_rsqrt28_round_pd(A, C) \
+ __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \
+ __builtin_ia32_rsqrt28pd_mask(A, W, U, C)
+
+#define _mm512_maskz_rsqrt28_round_pd(U, A, C) \
+ __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_rsqrt28_round_ps(A, C) \
+ __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \
+ __builtin_ia32_rsqrt28ps_mask(A, W, U, C)
+
+#define _mm512_maskz_rsqrt28_round_ps(U, A, C) \
+ __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_rcp28_round_sd(A, B, R) \
+ __builtin_ia32_rcp28sd_round(B, A, R)
+
+#define _mm_mask_rcp28_round_sd(W, U, A, B, R) \
+ __builtin_ia32_rcp28sd_mask_round ((B), (A), (W), (U), (R))
+
+#define _mm_maskz_rcp28_round_sd(U, A, B, R) \
+ __builtin_ia32_rcp28sd_mask_round ((B), (A), (__v2df) _mm_setzero_pd (), \
+ (U), (R))
+
+#define _mm_rcp28_round_ss(A, B, R) \
+ __builtin_ia32_rcp28ss_round(B, A, R)
+
+#define _mm_mask_rcp28_round_ss(W, U, A, B, R) \
+ __builtin_ia32_rcp28ss_mask_round ((B), (A), (W), (U), (R))
+
+#define _mm_maskz_rcp28_round_ss(U, A, B, R) \
+ __builtin_ia32_rcp28ss_mask_round ((B), (A), (__v4sf) _mm_setzero_ps (), \
+ (U), (R))
+
+#define _mm_rsqrt28_round_sd(A, B, R) \
+ __builtin_ia32_rsqrt28sd_round(B, A, R)
+
+#define _mm_mask_rsqrt28_round_sd(W, U, A, B, R) \
+ __builtin_ia32_rsqrt28sd_mask_round ((B), (A), (W), (U), (R))
+
+#define _mm_maskz_rsqrt28_round_sd(U, A, B, R) \
+ __builtin_ia32_rsqrt28sd_mask_round ((B), (A), (__v2df) _mm_setzero_pd (),\
+ (U), (R))
+
+#define _mm_rsqrt28_round_ss(A, B, R) \
+ __builtin_ia32_rsqrt28ss_round(B, A, R)
+
+#define _mm_mask_rsqrt28_round_ss(W, U, A, B, R) \
+ __builtin_ia32_rsqrt28ss_mask_round ((B), (A), (W), (U), (R))
+
+#define _mm_maskz_rsqrt28_round_ss(U, A, B, R) \
+ __builtin_ia32_rsqrt28ss_mask_round ((B), (A), (__v4sf) _mm_setzero_ps (),\
+ (U), (R))
+
+#endif
+
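+/* Usage sketch (illustrative): the *_round forms above take either
+   _MM_FROUND_CUR_DIRECTION or a rounding mode OR'ed with
+   _MM_FROUND_NO_EXC; e.g. a reciprocal estimate with relative error
+   bounded by 2^-28 and floating-point exceptions suppressed:
+
+     __m512
+     recip_estimate (__m512 __v)
+     {
+       return _mm512_rcp28_round_ps (__v, _MM_FROUND_NO_EXC);
+     }
+*/
+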
+#define _mm_mask_rcp28_sd(W, U, A, B)\
+ _mm_mask_rcp28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(U, A, B)\
+ _mm_maskz_rcp28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(W, U, A, B)\
+ _mm_mask_rcp28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(U, A, B)\
+ _mm_maskz_rcp28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(W, U, A, B)\
+ _mm_mask_rsqrt28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(U, A, B)\
+ _mm_maskz_rsqrt28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(W, U, A, B)\
+ _mm_mask_rsqrt28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(U, A, B)\
+ _mm_maskz_rsqrt28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_pd(A) \
+ _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(W, U, A) \
+ _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(U, A) \
+ _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_ps(A) \
+ _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(W, U, A) \
+ _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(U, A) \
+ _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_pd(A) \
+ _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(W, U, A) \
+ _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(U, A) \
+ _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_ps(A) \
+ _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(W, U, A) \
+ _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(U, A) \
+ _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_pd(A) \
+ _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(W, U, A) \
+ _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(U, A) \
+ _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_ps(A) \
+ _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(W, U, A) \
+ _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(U, A) \
+ _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_sd(A, B) \
+ __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_ss(A, B) \
+ __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_sd(A, B) \
+ __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_ss(A, B) \
+ __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#ifdef __DISABLE_AVX512ER__
+#undef __DISABLE_AVX512ER__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512ER__ */
+
+#endif /* _AVX512ERINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512FINTRIN_H_INCLUDED
+#define _AVX512FINTRIN_H_INCLUDED
+
+#ifndef __AVX512F__
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define __DISABLE_AVX512F__
+#endif /* __AVX512F__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef double __v8df __attribute__ ((__vector_size__ (64)));
+typedef float __v16sf __attribute__ ((__vector_size__ (64)));
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+typedef unsigned int __v16su __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* Unaligned version of the same type. */
+typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_int2mask (int __M)
+{
+ return (__mmask16) __M;
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2int (__mmask16 __M)
+{
+ return (int) __M;
+}
+
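+/* Usage sketch (illustrative): these helpers only reinterpret
+   between a plain int and a 16-bit mask value, e.g. selecting the
+   low eight lanes of __b and keeping the rest of __a:
+
+     __m512
+     blend_low8 (__m512 __a, __m512 __b)
+     {
+       return _mm512_mask_mov_ps (__a, _mm512_int2mask (0x00ff), __b);
+     }
+*/
+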
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi64 (long long __A, long long __B, long long __C,
+ long long __D, long long __E, long long __F,
+ long long __G, long long __H)
+{
+ return __extension__ (__m512i) (__v8di)
+ { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H I J K L M N O P]. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+ int __E, int __F, int __G, int __H,
+ int __I, int __J, int __K, int __L,
+ int __M, int __N, int __O, int __P)
+{
+ return __extension__ (__m512i)(__v16si)
+ { __P, __O, __N, __M, __L, __K, __J, __I,
+ __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi16 (short __q31, short __q30, short __q29, short __q28,
+ short __q27, short __q26, short __q25, short __q24,
+ short __q23, short __q22, short __q21, short __q20,
+ short __q19, short __q18, short __q17, short __q16,
+ short __q15, short __q14, short __q13, short __q12,
+ short __q11, short __q10, short __q09, short __q08,
+ short __q07, short __q06, short __q05, short __q04,
+ short __q03, short __q02, short __q01, short __q00)
+{
+ return __extension__ (__m512i)(__v32hi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+ __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+ __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
+ };
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi8 (char __q63, char __q62, char __q61, char __q60,
+ char __q59, char __q58, char __q57, char __q56,
+ char __q55, char __q54, char __q53, char __q52,
+ char __q51, char __q50, char __q49, char __q48,
+ char __q47, char __q46, char __q45, char __q44,
+ char __q43, char __q42, char __q41, char __q40,
+ char __q39, char __q38, char __q37, char __q36,
+ char __q35, char __q34, char __q33, char __q32,
+ char __q31, char __q30, char __q29, char __q28,
+ char __q27, char __q26, char __q25, char __q24,
+ char __q23, char __q22, char __q21, char __q20,
+ char __q19, char __q18, char __q17, char __q16,
+ char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00)
+{
+ return __extension__ (__m512i)(__v64qi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+ __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+ __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31,
+ __q32, __q33, __q34, __q35, __q36, __q37, __q38, __q39,
+ __q40, __q41, __q42, __q43, __q44, __q45, __q46, __q47,
+ __q48, __q49, __q50, __q51, __q52, __q53, __q54, __q55,
+ __q56, __q57, __q58, __q59, __q60, __q61, __q62, __q63
+ };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_pd (double __A, double __B, double __C, double __D,
+ double __E, double __F, double __G, double __H)
+{
+ return __extension__ (__m512d)
+ { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_ps (float __A, float __B, float __C, float __D,
+ float __E, float __F, float __G, float __H,
+ float __I, float __J, float __K, float __L,
+ float __M, float __N, float __O, float __P)
+{
+ return __extension__ (__m512)
+ { __P, __O, __N, __M, __L, __K, __J, __I,
+ __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
+ _mm512_set_epi64(e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \
+ e8,e9,e10,e11,e12,e13,e14,e15) \
+ _mm512_set_epi32(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
+ _mm512_set_pd(e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
+ _mm512_set_ps(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)
+
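+/* Usage sketch (illustrative): _mm512_set_* lists elements from the
+   highest index down to element 0, while the _mm512_setr_* aliases
+   take them in reversed (memory) order, so the two vectors below are
+   identical:
+
+     __m512i __a = _mm512_set_epi64 (7, 6, 5, 4, 3, 2, 1, 0);
+     __m512i __b = _mm512_setr_epi64 (0, 1, 2, 3, 4, 5, 6, 7);
+*/
+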
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_ps (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m512 __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+#define _mm512_undefined _mm512_undefined_ps
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_pd (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m512d __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_epi32 (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m512i __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+#define _mm512_undefined_si512 _mm512_undefined_epi32
+
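+/* Note: the self-initialization in the _mm512_undefined_* helpers
+   above is deliberate; it yields an indeterminate value without
+   touching memory, and the pragmas suppress the -Winit-self warning
+   the construct would otherwise raise. */
+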
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi8 (char __A)
+{
+ return __extension__ (__m512i)(__v64qi)
+ { __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi16 (short __A)
+{
+ return __extension__ (__m512i)(__v32hi)
+ { __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_pd (double __A)
+{
+ return __extension__ (__m512d)(__v8df)
+ { __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_ps (float __A)
+{
+ return __extension__ (__m512)(__v16sf)
+ { __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+/* Create the vector [A B C D A B C D A B C D A B C D]. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
+{
+ return __extension__ (__m512i)(__v16si)
+ { __D, __C, __B, __A, __D, __C, __B, __A,
+ __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_epi64 (long long __A, long long __B, long long __C,
+ long long __D)
+{
+ return __extension__ (__m512i) (__v8di)
+ { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+ return __extension__ (__m512d)
+ { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+ return __extension__ (__m512)
+ { __D, __C, __B, __A, __D, __C, __B, __A,
+ __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+#define _mm512_setr4_epi64(e0,e1,e2,e3) \
+ _mm512_set4_epi64(e3,e2,e1,e0)
+
+#define _mm512_setr4_epi32(e0,e1,e2,e3) \
+ _mm512_set4_epi32(e3,e2,e1,e0)
+
+#define _mm512_setr4_pd(e0,e1,e2,e3) \
+ _mm512_set4_pd(e3,e2,e1,e0)
+
+#define _mm512_setr4_ps(e0,e1,e2,e3) \
+ _mm512_set4_ps(e3,e2,e1,e0)
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_ps (void)
+{
+ return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero (void)
+{
+ return _mm512_setzero_ps ();
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_pd (void)
+{
+ return __extension__ (__m512d) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_epi32 (void)
+{
+ return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_si512 (void)
+{
+ return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_pd (void const *__P)
+{
+ return *(__m512d *) __P;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_pd (void *__P, __m512d __A)
+{
+ *(__m512d *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+ __builtin_ia32_storeapd512_mask ((__v8df *) __P, (__v8df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_ps (void const *__P)
+{
+ return *(__m512 *) __P;
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_ps (__mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_ps (void *__P, __m512 __A)
+{
+ *(__m512 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+ __builtin_ia32_storeaps512_mask ((__v16sf *) __P, (__v16sf) __A,
+ (__mmask16) __U);
+}
+
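+/* Usage sketch (illustrative, assumes 64-byte aligned pointers): the
+   mask_/maskz_ load forms differ only in what masked-off lanes
+   receive (the old value of __W versus zero), and the masked store
+   leaves unselected destination memory untouched:
+
+     void
+     copy_even_lanes (float *__dst, const float *__src)
+     {
+       __mmask16 __m = 0x5555;
+       __m512 __v = _mm512_maskz_load_ps (__m, __src);
+       _mm512_mask_store_ps (__dst, __m, __v);
+     }
+*/
+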
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_epi64 (void const *__P)
+{
+ return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_epi64 (void *__P, __m512i __A)
+{
+ *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+ __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_si512 (void const *__P)
+{
+ return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_epi32 (void const *__P)
+{
+ return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_si512 (void *__P, __m512i __A)
+{
+ *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_epi32 (void *__P, __m512i __A)
+{
+ *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+ __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A * (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullox_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullox_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return _mm512_mask_mov_epi64 (__W, __M, _mm512_mullox_epi64 (__A, __B));
+}
+
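+/* Usage sketch (illustrative): unlike _mm512_mul_epi32 below, which
+   widens the even 32-bit elements into full 64-bit products, the
+   mullox forms above keep only the low 64 bits of each 64x64-bit
+   product:
+
+     __m512i
+     low_products (__m512i __a, __m512i __b)
+     {
+       return _mm512_mullox_epi64 (__a, __b);
+     }
+*/
+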
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A + (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A - (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A + (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
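+/* mul_epi32 and mul_epu32 multiply the low 32-bit element of each
+ 64-bit lane of __X and __Y, sign- or zero-extended respectively, and
+ store the full 64-bit products. */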
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_epi32 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A - (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_epu32 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+ (__v16si) __Y,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi64 (__m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+#else
+#define _mm512_slli_epi64(X, C) \
+ ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask8)-1))
+
+#define _mm512_mask_slli_epi64(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_slli_epi64(U, X, C) \
+ ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask8)(U)))
+#endif
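+
+/* The count operand of these immediate-shift builtins must be a
+ compile-time constant. Without optimization the arguments of
+ always_inline wrappers are not constant-folded, hence the macro
+ fallbacks above; in either form the count must be a constant
+ expression, e.g. _mm512_slli_epi64 (__v, 3) for some __m512i value
+ __v. The same pattern recurs for the other immediate-operand
+ intrinsics in this file. */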
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi64 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi64 (__m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U,
+ __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+#else
+#define _mm512_srli_epi64(X, C) \
+ ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask8)-1))
+
+#define _mm512_mask_srli_epi64(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_srli_epi64(U, X, C) \
+ ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi64 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi64 (__m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+#else
+#define _mm512_srai_epi64(X, C) \
+ ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask8)-1))
+
+#define _mm512_mask_srai_epi64(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_srai_epi64(U, X, C) \
+ ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+ (__v8di)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi64 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+ (__v2di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi32 (__m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+#else
+#define _mm512_slli_epi32(X, C) \
+ ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_slli_epi32(W, U, X, C) \
+ ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_slli_epi32(U, X, C) \
+ ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi32 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi32 (__m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U,
+ __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+#else
+#define _mm512_srli_epi32(X, C) \
+ ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_srli_epi32(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_srli_epi32(U, X, C) \
+ ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi32 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi32 (__m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+ unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+ return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+#else
+#define _mm512_srai_epi32(X, C) \
+ ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_srai_epi32(W, U, X, C) \
+ ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_srai_epi32(U, X, C) \
+ ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi32 (__m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+ return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+ (__v4si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_add_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_addsd_round(A, B, C)
+
+#define _mm_mask_add_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_add_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_add_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_addss_round(A, B, C)
+
+#define _mm_mask_add_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_add_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_sub_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_subsd_round(A, B, C)
+
+#define _mm_mask_sub_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_sub_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_sub_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_subss_round(A, B, C)
+
+#define _mm_mask_sub_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_sub_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
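+
+/* In the _round_ intrinsics, __R selects the rounding behaviour and
+ must be a compile-time constant: either _MM_FROUND_CUR_DIRECTION, or
+ a mode such as (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) to
+ round to nearest while suppressing exceptions. The scalar _sd/_ss
+ forms operate on the low element only and copy the remaining upper
+ elements from __A. */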
+
+/* Constant helpers used to represent the ternary logic operations
+ among vectors A, B and C. */
+typedef enum
+{
+ _MM_TERNLOG_A = 0xF0,
+ _MM_TERNLOG_B = 0xCC,
+ _MM_TERNLOG_C = 0xAA
+} _MM_TERNLOG_ENUM;
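+
+/* Each constant is the 8-bit truth table of one input, so bitwise
+ expressions over them yield the immediate for the corresponding
+ element-wise operation, e.g.
+ _MM_TERNLOG_A ^ _MM_TERNLOG_B ^ _MM_TERNLOG_C is 0x96 (A XOR B XOR C),
+ (_MM_TERNLOG_A & _MM_TERNLOG_B) | _MM_TERNLOG_C is 0xEA ((A AND B) OR C),
+ either of which can be passed as the __imm operand below. */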
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C,
+ const int __imm)
+{
+ return (__m512i)
+ __builtin_ia32_pternlogq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __C,
+ (unsigned char) __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512i)
+ __builtin_ia32_pternlogq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512i)
+ __builtin_ia32_pternlogq512_maskz ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C,
+ const int __imm)
+{
+ return (__m512i)
+ __builtin_ia32_pternlogd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __C,
+ (unsigned char) __imm,
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512i)
+ __builtin_ia32_pternlogd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __C,
+ (unsigned char) __imm,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512i)
+ __builtin_ia32_pternlogd512_maskz ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __C,
+ (unsigned char) __imm,
+ (__mmask16) __U);
+}
+#else
+#define _mm512_ternarylogic_epi64(A, B, C, I) \
+ ((__m512i) \
+ __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \
+ (__v8di) (__m512i) (B), \
+ (__v8di) (__m512i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) -1))
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \
+ ((__m512i) \
+ __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \
+ (__v8di) (__m512i) (B), \
+ (__v8di) (__m512i) (C), \
+ (unsigned char)(I), \
+ (__mmask8) (U)))
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \
+ ((__m512i) \
+ __builtin_ia32_pternlogq512_maskz ((__v8di) (__m512i) (A), \
+ (__v8di) (__m512i) (B), \
+ (__v8di) (__m512i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+#define _mm512_ternarylogic_epi32(A, B, C, I) \
+ ((__m512i) \
+ __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \
+ (__v16si) (__m512i) (B), \
+ (__v16si) (__m512i) (C), \
+ (unsigned char) (I), \
+ (__mmask16) -1))
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \
+ ((__m512i) \
+ __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \
+ (__v16si) (__m512i) (B), \
+ (__v16si) (__m512i) (C), \
+ (unsigned char) (I), \
+ (__mmask16) (U)))
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \
+ ((__m512i) \
+ __builtin_ia32_pternlogd512_maskz ((__v16si) (__m512i) (A), \
+ (__v16si) (__m512i) (B), \
+ (__v16si) (__m512i) (C), \
+ (unsigned char) (I), \
+ (__mmask16) (U)))
+#endif
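+
+/* The rcp14 intrinsics below compute approximate reciprocals with a
+ maximum relative error of 2^-14; the scalar forms take the value from
+ __B and copy the upper elements from __A. */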
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp14_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp14_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B,
+ (__v2df) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B,
+ (__v4sf) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
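+
+/* Likewise, the rsqrt14 intrinsics below approximate reciprocal square
+ roots with a maximum relative error of 2^-14. */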
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt14_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt14_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B,
+ (__v2df) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B,
+ (__v4sf) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
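+
+/* Unlike the 2^-14 approximations above, the sqrt_round intrinsics
+ below compute correctly rounded square roots; the _mm_mask_sqrt_sd
+ macro family further down wraps the scalar forms with
+ _MM_FROUND_CUR_DIRECTION. */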
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_pd (__m512d __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_ps (__m512 __A, const int __R)
+{
+ return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R)
+{
+ return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R)
+{
+ return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+ (__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+ (__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+#else
+#define _mm512_sqrt_round_pd(A, C) \
+ (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_sqrt_round_pd(W, U, A, C) \
+ (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C)
+
+#define _mm512_maskz_sqrt_round_pd(U, A, C) \
+ (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_sqrt_round_ps(A, C) \
+ (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_sqrt_round_ps(W, U, A, C) \
+ (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C)
+
+#define _mm512_maskz_sqrt_round_ps(U, A, C) \
+ (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_sqrt_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
+ (__v2df) _mm_setzero_pd (), -1, C)
+
+#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C)
+
+#define _mm_maskz_sqrt_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
+ (__v2df) _mm_setzero_pd (), U, C)
+
+#define _mm_sqrt_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
+ (__v4sf) _mm_setzero_ps (), -1, C)
+
+#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C)
+
+#define _mm_maskz_sqrt_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
+ (__v4sf) _mm_setzero_ps (), U, C)
+#endif
+
+#define _mm_mask_sqrt_sd(W, U, A, B) \
+ _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_sqrt_sd(U, A, B) \
+ _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_sqrt_ss(W, U, A, B) \
+ _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_sqrt_ss(U, A, B) \
+ _mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
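+/* The cvtepi*_epi* intrinsics below sign-extend and the cvtepu*_epi*
+ intrinsics zero-extend packed integer elements to the wider element
+ width. */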
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi32 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi64 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi32 (__m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi64 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi64 (__m256i __X)
+{
+ return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+ return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+{
+ return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi32 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi64 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi32 (__m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi64 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_epi64 (__m256i __X)
+{
+ return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+ return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+{
+ return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+#else
+#define _mm512_add_round_pd(A, B, C) \
+ (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_add_round_pd(W, U, A, B, C) \
+ (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_add_round_pd(U, A, B, C) \
+ (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_add_round_ps(A, B, C) \
+ (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_add_round_ps(W, U, A, B, C) \
+ (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_add_round_ps(U, A, B, C) \
+ (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_sub_round_pd(A, B, C) \
+ (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_sub_round_pd(W, U, A, B, C) \
+ (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_sub_round_pd(U, A, B, C) \
+ (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_sub_round_ps(A, B, C) \
+ (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_sub_round_ps(W, U, A, B, C) \
+ (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_sub_round_ps(U, A, B, C) \
+ (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R)
+{
+ return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+ (__v8df) __V,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M,
+ __m512d __V, const int __R)
+{
+ return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+ (__v8df) __V,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+ (__v8df) __V,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm512_mul_round_pd(A, B, C) \
+ (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_mul_round_pd(W, U, A, B, C) \
+ (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_mul_round_pd(U, A, B, C) \
+ (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_mul_round_ps(A, B, C) \
+ (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_mul_round_ps(W, U, A, B, C) \
+ (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_mul_round_ps(U, A, B, C) \
+ (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_div_round_pd(A, B, C) \
+ (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_div_round_pd(W, U, A, B, C) \
+ (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_div_round_pd(U, A, B, C) \
+ (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_div_round_ps(A, B, C) \
+ (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_div_round_ps(W, U, A, B, C) \
+ (__m512)__builtin_ia32_divps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_div_round_ps(U, A, B, C) \
+ (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_mul_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_mulsd_round(A, B, C)
+
+#define _mm_mask_mul_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_mul_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_mul_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_mulss_round(A, B, C)
+
+#define _mm_mask_mul_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_mul_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_div_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_divsd_round(A, B, C)
+
+#define _mm_mask_div_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_div_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_div_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_divss_round(A, B, C)
+
+#define _mm_mask_div_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_div_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
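+
+/* Note on the split above: the inline definitions are provided only
+   under __OPTIMIZE__, where the rounding argument __R reliably folds
+   to the compile-time constant the builtin's immediate operand needs;
+   the macro forms cover unoptimized builds.  __R is a rounding
+   control such as _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, or
+   _MM_FROUND_CUR_DIRECTION to use the current MXCSR mode.  A minimal
+   usage sketch, with __a, __b and __src standing for caller-provided
+   vectors:
+
+     __m512d __q = _mm512_mask_div_round_pd (__src, (__mmask8) 0x0f,
+                                             __a, __b,
+                                             _MM_FROUND_TO_NEAREST_INT
+                                             | _MM_FROUND_NO_EXC);
+
+   Elements whose mask bit is clear are copied from __src; the _maskz
+   forms zero them instead.  */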
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+#else
+#define _mm512_max_round_pd(A, B, R) \
+ (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
+
+#define _mm512_mask_max_round_pd(W, U, A, B, R) \
+ (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_max_round_pd(U, A, B, R) \
+ (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
+
+#define _mm512_max_round_ps(A, B, R) \
+ (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R)
+
+#define _mm512_mask_max_round_ps(W, U, A, B, R) \
+ (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_max_round_ps(U, A, B, R) \
+ (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
+
+#define _mm512_min_round_pd(A, B, R) \
+ (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
+
+#define _mm512_mask_min_round_pd(W, U, A, B, R) \
+ (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_min_round_pd(U, A, B, R) \
+ (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
+
+#define _mm512_min_round_ps(A, B, R) \
+ (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R)
+
+#define _mm512_mask_min_round_ps(W, U, A, B, R) \
+ (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_min_round_ps(U, A, B, R) \
+ (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
+#endif
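+
+/* The max/min forms do not round, so the only useful bit in __R here
+   is SAE: _MM_FROUND_NO_EXC suppresses exceptions, while
+   _MM_FROUND_CUR_DIRECTION leaves them enabled.  As with the SSE
+   forms, when an element pair contains a NaN the element of the
+   second operand is returned.  Sketch, __a and __b caller-provided:
+
+     __m512 __m = _mm512_max_round_ps (__a, __b, _MM_FROUND_NO_EXC);  */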
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B, const int __R)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __R)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+#else
+#define _mm512_scalef_round_pd(A, B, C) \
+ ((__m512d) \
+ __builtin_ia32_scalefpd512_mask((A), (B), \
+ (__v8df) _mm512_undefined_pd(), \
+ -1, (C)))
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \
+ ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, C) \
+ ((__m512d) \
+ __builtin_ia32_scalefpd512_mask((A), (B), \
+ (__v8df) _mm512_setzero_pd(), \
+ (U), (C)))
+
+#define _mm512_scalef_round_ps(A, B, C) \
+ ((__m512) \
+ __builtin_ia32_scalefps512_mask((A), (B), \
+ (__v16sf) _mm512_undefined_ps(), \
+ -1, (C)))
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \
+ ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, C) \
+ ((__m512) \
+ __builtin_ia32_scalefps512_mask((A), (B), \
+ (__v16sf) _mm512_setzero_ps(), \
+ (U), (C)))
+
+#define _mm_scalef_round_sd(A, B, C) \
+ ((__m128d) \
+ __builtin_ia32_scalefsd_mask_round ((A), (B), \
+ (__v2df) _mm_undefined_pd (), \
+ -1, (C)))
+
+#define _mm_scalef_round_ss(A, B, C) \
+ ((__m128) \
+ __builtin_ia32_scalefss_mask_round ((A), (B), \
+ (__v4sf) _mm_undefined_ps (), \
+ -1, (C)))
+
+#define _mm_mask_scalef_round_sd(W, U, A, B, C) \
+ ((__m128d) \
+ __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C)))
+
+#define _mm_mask_scalef_round_ss(W, U, A, B, C) \
+ ((__m128) \
+ __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C)))
+
+#define _mm_maskz_scalef_round_sd(U, A, B, C) \
+ ((__m128d) \
+ __builtin_ia32_scalefsd_mask_round ((A), (B), \
+ (__v2df) _mm_setzero_pd (), \
+ (U), (C)))
+
+#define _mm_maskz_scalef_round_ss(U, A, B, C) \
+ ((__m128) \
+ __builtin_ia32_scalefss_mask_round ((A), (B), \
+ (__v4sf) _mm_setzero_ps (), \
+ (U), (C)))
+#endif
+
+#define _mm_mask_scalef_sd(W, U, A, B) \
+ _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_scalef_sd(U, A, B) \
+ _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_scalef_ss(W, U, A, B) \
+ _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_scalef_ss(U, A, B) \
+ _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
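+
+/* VSCALEF computes __A * 2**floor(__B) per element, the vector
+   analogue of ldexp/scalbn; the wrappers just above supply
+   _MM_FROUND_CUR_DIRECTION for the non-_round spellings.  Sketch,
+   __x and __e caller-provided:
+
+     __m512d __y = _mm512_scalef_round_pd (__x, __e,
+                                           _MM_FROUND_CUR_DIRECTION);  */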
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+ __mmask8 __U, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+ __mmask16 __U, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+ __mmask8 __U, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+ __mmask16 __U, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+ __mmask8 __U, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+ __mmask16 __U, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+ __mmask8 __U, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+ __mmask16 __U, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+ __mmask8 __U, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+ __mmask16 __U, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+ __mmask8 __U, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512d __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+ __mmask16 __U, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512 __C, const int __R)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U, __R);
+}
+#else
+#define _mm512_fmadd_round_pd(A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \
+ (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmadd_round_ps(A, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \
+ (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsub_round_pd(A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \
+ (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \
+ (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsub_round_ps(A, B, C, R) \
+ (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \
+ (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \
+ (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \
+ (__m512)__builtin_ia32_vfmsubps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \
+ (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \
+ (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \
+ (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \
+ (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \
+ (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \
+ (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fnmadd_round_pd(A, B, C, R) \
+ (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \
+ (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \
+ (__m512d)__builtin_ia32_vfnmaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \
+ (__m512d)__builtin_ia32_vfnmaddpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fnmadd_round_ps(A, B, C, R) \
+ (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \
+ (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \
+ (__m512)__builtin_ia32_vfnmaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \
+ (__m512)__builtin_ia32_vfnmaddps512_maskz(A, B, C, U, R)
+
+#define _mm512_fnmsub_round_pd(A, B, C, R) \
+ (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \
+ (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \
+ (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \
+ (__m512d)__builtin_ia32_vfnmsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fnmsub_round_ps(A, B, C, R) \
+ (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \
+ (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \
+ (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \
+ (__m512)__builtin_ia32_vfnmsubps512_maskz(A, B, C, U, R)
+#endif
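+
+/* The FMA family follows one masking convention: _mask blends
+   masked-off elements from the first operand, _mask3 from the third,
+   and _maskz zeroes them.  That is why fmsubadd can reuse the
+   fmaddsub builtins with a negated third operand for the _mask and
+   _maskz forms, but needs the dedicated vfmsubadd*_mask3 builtins
+   where the un-negated third operand must survive the blend.  Sketch
+   of a masked accumulation, __acc, __a and __b caller-provided:
+
+     __acc = _mm512_mask3_fmadd_round_pd (__a, __b, __acc,
+                                          (__mmask8) 0xf0,
+                                          _MM_FROUND_CUR_DIRECTION);  */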
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi64 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi32 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
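+
+/* Element-wise integer absolute value.  As with the narrower PABS
+   instructions, the most negative value has no positive counterpart
+   and comes back unchanged (INT32_MIN stays INT32_MIN).  Sketch, __v
+   caller-provided:
+
+     __m512i __r = _mm512_maskz_abs_epi32 ((__mmask16) 0x00ff, __v);  */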
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastss_ps (__m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+ (__v16sf) __O, __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastsd_pd (__m128d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+ (__v8df) __O, __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+ (__v16si) __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi32 (int __A)
+{
+ return (__m512i)(__v16si)
+ { __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi32 (__mmask16 __M, int __A)
+{
+ return (__m512i)
+ __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+ (__v16si) _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+ (__v8di) __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi64 (long long __A)
+{
+ return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+ return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+ return (__m512i)
+ __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+ (__v8di) _mm512_setzero_si512 (),
+ __M);
+}
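+
+/* The unmasked set1 forms are plain vector constructors, leaving the
+   compiler free to materialize the splat however it likes; the masked
+   forms go through the *_gpr_mask builtins, which broadcast directly
+   from a general-purpose register under the writemask.  Sketch, __old
+   caller-provided:
+
+     __m512i __v = _mm512_mask_set1_epi32 (__old, (__mmask16) 0x5555,
+                                           42);  */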
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x4 (__m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+ (__v16sf) __O,
+ __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+{
+ return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x4 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+ (__v16si) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f64x4 (__m256d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+ (__v8df) __O,
+ __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+{
+ return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i64x4 (__m256i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+ (__v8di) __O,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+{
+ return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
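+
+/* Unlike the single-element broadcasts above, the x4 forms replicate
+   a whole 128-bit source four times (f32x4/i32x4) or a 256-bit source
+   twice (f64x4/i64x4) across the 512-bit destination.  Sketch, __quad
+   caller-provided:
+
+     __m512 __r = _mm512_broadcast_f32x4 (__quad);  */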
+
+typedef enum
+{
+ _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+ _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+ _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+ _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+ _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+ _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+ _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+ _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+ _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+ _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+ _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+ _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+ _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+ _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+ _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+ _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+ _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+ _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+ _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+ _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+ _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+ _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+ _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+ _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+ _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+ _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+ _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+ _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+ _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+ _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+ _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+ _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+ _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+ _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+ _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+ _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+ _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+ _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+ _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+ _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+ _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+ _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+ _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+ _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+ _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+ _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+ _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+ _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+ _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+ _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+ _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+ _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+ _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+ _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+ _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+ _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+ _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+ _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+ _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+ _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+ _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+ _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+ _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+ _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+ _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+ _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+ _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+ _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+ _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+ _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+ _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+ _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+ _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+ _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+ _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+ _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+ _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+ _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+ _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+ _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+ _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+ _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+ _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+ _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+ _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+ _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
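+
+/* _MM_PERM_ENUM packs four 2-bit selectors, one per destination
+   element, with the letters naming source elements A=0 through D=3
+   from the most significant field down; _MM_PERM_DCBA (0xE4) is thus
+   the identity and _MM_PERM_AAAA broadcasts element 0.  For the
+   512-bit shuffle_epi32 below the pattern applies within each 128-bit
+   lane.  Sketch, __v caller-provided:
+
+     __m512i __b = _mm512_shuffle_epi32 (__v, _MM_PERM_AAAA);  */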
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask)
+{
+ return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+ __mask,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+ _MM_PERM_ENUM __mask)
+{
+ return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+ __mask,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask)
+{
+ return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+ __mask,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+ (__v8di) __B, __imm,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A,
+ __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+ (__v8di) __B, __imm,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+ (__v8di) __B, __imm,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+ (__v16si) __B,
+ __imm,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A,
+ __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+ (__v16si) __B,
+ __imm,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+ (__v16si) __B,
+ __imm,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm)
+{
+ return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+ (__v8df) __B, __imm,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A,
+ __m512d __B, const int __imm)
+{
+ return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+ (__v8df) __B, __imm,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B,
+ const int __imm)
+{
+ return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+ (__v8df) __B, __imm,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm)
+{
+ return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+ (__v16sf) __B, __imm,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A,
+ __m512 __B, const int __imm)
+{
+ return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+ (__v16sf) __B, __imm,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B,
+ const int __imm)
+{
+ return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+ (__v16sf) __B, __imm,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+#else
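+/* Without optimization the always_inline wrappers above cannot
+   guarantee that the immediate operand reaches the builtin as a
+   compile-time constant, so plain macro expansions are used
+   instead.  */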
+#define _mm512_shuffle_epi32(X, C) \
+ ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_shuffle_epi32(W, U, X, C) \
+ ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_epi32(U, X, C) \
+ ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+ (__v16si)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask16)(U)))
+
+#define _mm512_shuffle_i64x2(X, Y, C) \
+ ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(C),\
+ (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask8)-1))
+
+#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C) \
+ ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(C),\
+ (__v8di)(__m512i)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_i64x2(U, X, Y, C) \
+ ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(C),\
+ (__v8di)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask8)(U)))
+
+#define _mm512_shuffle_i32x4(X, Y, C) \
+ ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(C),\
+ (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C) \
+ ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(C),\
+ (__v16si)(__m512i)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_i32x4(U, X, Y, C) \
+ ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(C),\
+ (__v16si)(__m512i)_mm512_setzero_si512 (),\
+ (__mmask16)(U)))
+
+#define _mm512_shuffle_f64x2(X, Y, C) \
+ ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(C),\
+ (__v8df)(__m512d)_mm512_undefined_pd(),\
+ (__mmask8)-1))
+
+#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C) \
+ ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(C),\
+ (__v8df)(__m512d)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_f64x2(U, X, Y, C) \
+ ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(C),\
+ (__v8df)(__m512d)_mm512_setzero_pd(),\
+ (__mmask8)(U)))
+
+#define _mm512_shuffle_f32x4(X, Y, C) \
+ ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(C),\
+ (__v16sf)(__m512)_mm512_undefined_ps(),\
+ (__mmask16)-1))
+
+#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C) \
+ ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(C),\
+ (__v16sf)(__m512)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_f32x4(U, X, Y, C) \
+ ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(C),\
+ (__v16sf)(__m512)_mm512_setzero_ps(),\
+ (__mmask16)(U)))
+#endif
+
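+/* Rotate the 32-bit (epi32) or 64-bit (epi64) elements of __A left
+   (rolv) or right (rorv) by the per-element counts in __B.  */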
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rolv_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rorv_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rolv_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rorv_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
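+/* Convert packed double-precision values to (unsigned) 32-bit
+   integers with truncation; __R controls exception suppression
+   (_MM_FROUND_NO_EXC).  */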
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U, __R);
+}
+#else
+#define _mm512_cvtt_roundpd_epi32(A, B) \
+ ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B) \
+ ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B) \
+ ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+
+#define _mm512_cvtt_roundpd_epu32(A, B) \
+ ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B) \
+ ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B) \
+ ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+#endif
+
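+/* As above, but rounding according to __R instead of truncating.  */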
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U, __R);
+}
+#else
+#define _mm512_cvt_roundpd_epi32(A, B) \
+ ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B) \
+ ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, B) \
+ ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+
+#define _mm512_cvt_roundpd_epu32(A, B) \
+ ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B) \
+ ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, B) \
+ ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+#endif
+
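+/* Truncating conversions from packed single-precision values to
+   (unsigned) 32-bit integers, with exception control in __R.  */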
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U, __R);
+}
+#else
+#define _mm512_cvtt_roundps_epi32(A, B) \
+ ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+
+#define _mm512_cvtt_roundps_epu32(A, B) \
+ ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, B) \
+ ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+#endif
+
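+/* Rounding conversions from packed single-precision values to
+   (unsigned) 32-bit integers.  */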
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epi32 (__m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epu32 (__m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
+ const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U, __R);
+}
+#else
+#define _mm512_cvt_roundps_epi32(A, B) \
+ ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundps_epi32(U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+
+#define _mm512_cvt_roundps_epu32(A, B) \
+ ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundps_epu32(U, A, B) \
+ ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+#endif
+
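+/* Scalar integer to floating-point conversions; the forms whose
+   result may be inexact take an explicit rounding mode __R.  */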
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+ return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C)
+
+#define _mm_cvt_roundi64_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+
+#define _mm_cvt_roundsi64_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+#endif
+
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu32_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtusi2ss32(A, B, C)
+
+#define _mm_cvt_roundi32_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+
+#define _mm_cvt_roundsi32_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtusi2ss64(A, B, C)
+
+#define _mm_cvt_roundi64_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+
+#define _mm_cvt_roundsi64_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+#endif
+
+#endif
+
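+/* Narrowing integer conversions: plain cvt forms truncate each
+   element, cvtsep forms saturate as signed, cvtusep forms saturate
+   as unsigned, and the storeu variants write the narrowed elements
+   straight to unaligned memory under mask __M.  */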
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi32_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi32_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi16 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+ (__v16hi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi32_epi16 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+ (__v16hi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi32_epi16 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+ (__v16hi) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi32 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+ (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi32 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+ (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi32 (__m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+ (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+ return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi16 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi16 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi16 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovqb512mem_mask ((unsigned long long *) __P,
+ (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovsqb512mem_mask ((unsigned long long *) __P,
+ (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi8 (__m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+ __builtin_ia32_pmovusqb512mem_mask ((unsigned long long *) __P,
+ (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
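+/* Widening conversions from packed (unsigned) 32-bit integers to
+   double precision; these are exact, hence no rounding argument.  */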
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_pd (__m256i __A)
+{
+ return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+ return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+ return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_pd (__m256i __A)
+{
+ return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+ return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+ return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
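+/* Conversions from packed (unsigned) 32-bit integers to single
+   precision, rounded according to __R.  */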
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi32_ps (__m512i __A, const int __R)
+{
+ return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R)
+{
+ return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ps (__m512i __A, const int __R)
+{
+ return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R)
+{
+ return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+#else
+#define _mm512_cvt_roundepi32_ps(A, B) \
+ (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B) \
+ (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B)
+
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, B) \
+ (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+
+#define _mm512_cvt_roundepu32_ps(A, B) \
+ (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B) \
+ (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B)
+
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, B) \
+ (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+#endif
+
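+/* Extract the 256-bit or 128-bit lane of a 512-bit vector selected
+   by __imm.  */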
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf64x4_pd (__m512d __A, const int __imm)
+{
+ return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+ __imm,
+ (__v4df)
+ _mm256_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A,
+ const int __imm)
+{
+ return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+ __imm,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm)
+{
+ return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+ __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf32x4_ps (__m512 __A, const int __imm)
+{
+ return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+ __imm,
+ (__v4sf)
+ _mm_undefined_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A,
+ const int __imm)
+{
+ return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+ __imm,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm)
+{
+ return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+ __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti64x4_epi64 (__m512i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+ __imm,
+ (__v4di)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+ __imm,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+ __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti32x4_epi32 (__m512i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+ __imm,
+ (__v4si)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+ __imm,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+ __imm,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+#else
+
+#define _mm512_extractf64x4_pd(X, C) \
+ ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \
+ (int) (C),\
+ (__v4df)(__m256d)_mm256_undefined_pd(),\
+ (__mmask8)-1))
+
+#define _mm512_mask_extractf64x4_pd(W, U, X, C) \
+ ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \
+ (int) (C),\
+ (__v4df)(__m256d)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extractf64x4_pd(U, X, C) \
+ ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \
+ (int) (C),\
+ (__v4df)(__m256d)_mm256_setzero_pd(),\
+ (__mmask8)(U)))
+
+#define _mm512_extractf32x4_ps(X, C) \
+ ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \
+ (int) (C),\
+ (__v4sf)(__m128)_mm_undefined_ps(),\
+ (__mmask8)-1))
+
+#define _mm512_mask_extractf32x4_ps(W, U, X, C) \
+ ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \
+ (int) (C),\
+ (__v4sf)(__m128)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extractf32x4_ps(U, X, C) \
+ ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \
+ (int) (C),\
+ (__v4sf)(__m128)_mm_setzero_ps(),\
+ (__mmask8)(U)))
+
+#define _mm512_extracti64x4_epi64(X, C) \
+ ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \
+ (int) (C),\
+ (__v4di)(__m256i)_mm256_undefined_si256 (),\
+ (__mmask8)-1))
+
+#define _mm512_mask_extracti64x4_epi64(W, U, X, C) \
+ ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \
+ (int) (C),\
+ (__v4di)(__m256i)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extracti64x4_epi64(U, X, C) \
+ ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \
+ (int) (C),\
+ (__v4di)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)(U)))
+
+#define _mm512_extracti32x4_epi32(X, C) \
+ ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \
+ (int) (C),\
+ (__v4si)(__m128i)_mm_undefined_si128 (),\
+ (__mmask8)-1))
+
+#define _mm512_mask_extracti32x4_epi32(W, U, X, C) \
+ ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \
+ (int) (C),\
+ (__v4si)(__m128i)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_extracti32x4_epi32(U, X, C) \
+ ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \
+ (int) (C),\
+ (__v4si)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+#endif
+
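+/* Insert a 128-bit or 256-bit vector into the lane of a 512-bit
+   vector selected by __imm.  */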
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A,
+ (__v4si) __B,
+ __imm,
+ (__v16si) __A, -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm)
+{
+ return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A,
+ (__v4sf) __B,
+ __imm,
+ (__v16sf) __A, -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+ (__v4di) __B,
+ __imm,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A,
+ __m256i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+ (__v4di) __B,
+ __imm,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+ (__v4di) __B,
+ __imm,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm)
+{
+ return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+ (__v4df) __B,
+ __imm,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A,
+ __m256d __B, const int __imm)
+{
+ return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+ (__v4df) __B,
+ __imm,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B,
+ const int __imm)
+{
+ return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+ (__v4df) __B,
+ __imm,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+#else
+#define _mm512_insertf32x4(X, Y, C) \
+ ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \
+ (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1)))
+
+#define _mm512_inserti32x4(X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \
+ (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1)))
+
+#define _mm512_insertf64x4(X, Y, C) \
+ ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \
+ (__v4df)(__m256d) (Y), (int) (C), \
+ (__v8df)(__m512d)_mm512_undefined_pd(), \
+ (__mmask8)-1))
+
+#define _mm512_mask_insertf64x4(W, U, X, Y, C) \
+ ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \
+ (__v4df)(__m256d) (Y), (int) (C), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_insertf64x4(U, X, Y, C) \
+ ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \
+ (__v4df)(__m256d) (Y), (int) (C), \
+ (__v8df)(__m512d)_mm512_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm512_inserti64x4(X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \
+ (__v4di)(__m256i) (Y), (int) (C), \
+ (__v8di)(__m512i)_mm512_undefined_epi32 (), \
+ (__mmask8)-1))
+
+#define _mm512_mask_inserti64x4(W, U, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \
+ (__v4di)(__m256i) (Y), (int) (C),\
+ (__v8di)(__m512i)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_inserti64x4(U, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \
+ (__v4di)(__m256i) (Y), (int) (C), \
+ (__v8di)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask8)(U)))
+#endif
+
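+/* Unaligned loads and stores.  The unmasked forms dereference the
+   unaligned, may-alias __m512*_u types; the masked forms go through
+   builtins so that only the selected elements are accessed.  */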
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_pd (void const *__P)
+{
+ return *(__m512d_u *)__P;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_pd (void *__P, __m512d __A)
+{
+ *(__m512d_u *)__P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+ __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_ps (void const *__P)
+{
+ return *(__m512_u *)__P;
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_ps (void *__P, __m512 __A)
+{
+ *(__m512_u *)__P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+ __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A,
+ (__mmask16) __U);
+}
+
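+/* Masked scalar (ss/sd) operations on the low element: the load and
+   move forms take the low element from the source when bit 0 of __U
+   is set and from the fallback (or zero) otherwise; the store forms
+   write the low element only when bit 0 is set.  */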
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
+{
+ return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_ss (__mmask8 __U, const float *__P)
+{
+ return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (),
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
+{
+ return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sd (__mmask8 __U, const double *__P)
+{
+ return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (),
+ __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+ (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+ (__v4sf) _mm_setzero_ps (), __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+ (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+ (__v2df) _mm_setzero_pd (),
+ __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi64 (void const *__P)
+{
+ return *(__m512i_u *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi64 (void *__P, __m512i __A)
+{
+ *(__m512i_u *) __P = (__m512i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+ __builtin_ia32_storedqudi512_mask ((long long *) __P, (__v8di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_si512 (void const *__P)
+{
+ return *(__m512i_u *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi32 (void const *__P)
+{
+ return *(__m512i_u *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+ *(__m512i_u *) __P = (__m512i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi32 (void *__P, __m512i __A)
+{
+ *(__m512i_u *) __P = (__m512i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+ __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
+ (__mmask16) __U);
+}
+
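+/* vpermilvar: shuffle within each 128-bit lane only, selecting by the
+ low bits of the matching element of __C (bit 1 for doubles, bits 1:0
+ for floats).  Cross-lane moves need the permutexvar/permutex2var
+ intrinsics further below.  */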
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutevar_pd (__m512d __A, __m512i __C)
+{
+ return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+ (__v8di) __C,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+ return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+ (__v8di) __C,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+{
+ return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+ (__v8di) __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutevar_ps (__m512 __A, __m512i __C)
+{
+ return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+ (__v16si) __C,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+ return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+ (__v16si) __C,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+{
+ return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+ (__v16si) __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
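+/* permutex2var (vpermt2q / vpermi2q and friends): a full two-source
+ permute.  Element i of the result comes from the concatenation of
+ __A and __B, indexed by the low bits of __I[i] (the top index bit
+ selects __B).  The mask2 variants merge into the index operand __I
+ rather than __A.  Illustrative example, interleaving the
+ even-numbered elements of a and b:
+
+ __m512i idx = _mm512_setr_epi64 (0, 8, 2, 10, 4, 12, 6, 14);
+ __m512i r = _mm512_permutex2var_epi64 (a, idx, b);  */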
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+ /* idx */ ,
+ (__v8di) __A,
+ (__v8di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+ /* idx */ ,
+ (__v8di) __A,
+ (__v8di) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
+ __mmask8 __U, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
+ (__v8di) __I
+ /* idx */ ,
+ (__v8di) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
+ __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
+ /* idx */ ,
+ (__v8di) __A,
+ (__v8di) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+ /* idx */ ,
+ (__v16si) __A,
+ (__v16si) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
+ __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+ /* idx */ ,
+ (__v16si) __A,
+ (__v16si) __B,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
+ __mmask16 __U, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
+ (__v16si) __I
+ /* idx */ ,
+ (__v16si) __B,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
+ __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
+ /* idx */ ,
+ (__v16si) __A,
+ (__v16si) __B,
+ (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+ /* idx */ ,
+ (__v8df) __A,
+ (__v8df) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+ /* idx */ ,
+ (__v8df) __A,
+ (__v8df) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
+ (__v8di) __I
+ /* idx */ ,
+ (__v8df) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
+ __m512d __B)
+{
+ return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
+ /* idx */ ,
+ (__v8df) __A,
+ (__v8df) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
+{
+ return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+ /* idx */ ,
+ (__v16sf) __A,
+ (__v16sf) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+ return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+ /* idx */ ,
+ (__v16sf) __A,
+ (__v16sf) __B,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
+ __m512 __B)
+{
+ return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
+ (__v16si) __I
+ /* idx */ ,
+ (__v16sf) __B,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
+ __m512 __B)
+{
+ return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
+ /* idx */ ,
+ (__v16sf) __A,
+ (__v16sf) __B,
+ (__mmask16) __U);
+}
+
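+/* The intrinsics below require their selector argument to reach the
+ builtin as a compile-time constant.  When not optimizing, gnu_inline
+ functions are not guaranteed to be inlined, so the #else branches
+ provide equivalent macros that splice the constant in directly.  */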
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_pd (__m512d __X, const int __C)
+{
+ return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C)
+{
+ return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C)
+{
+ return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_ps (__m512 __X, const int __C)
+{
+ return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C)
+{
+ return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C)
+{
+ return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+#else
+#define _mm512_permute_pd(X, C) \
+ ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \
+ (__v8df)(__m512d)_mm512_undefined_pd(),\
+ (__mmask8)(-1)))
+
+#define _mm512_mask_permute_pd(W, U, X, C) \
+ ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U)))
+
+#define _mm512_maskz_permute_pd(U, X, C) \
+ ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \
+ (__v8df)(__m512d)_mm512_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm512_permute_ps(X, C) \
+ ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \
+ (__v16sf)(__m512)_mm512_undefined_ps(),\
+ (__mmask16)(-1)))
+
+#define _mm512_mask_permute_ps(W, U, X, C) \
+ ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U)))
+
+#define _mm512_maskz_permute_ps(U, X, C) \
+ ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \
+ (__v16sf)(__m512)_mm512_setzero_ps(), \
+ (__mmask16)(U)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_epi64 (__m512i __X, const int __I)
+{
+ return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) (-1));
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M,
+ __m512i __X, const int __I)
+{
+ return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+ (__v8di) __W,
+ (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I)
+{
+ return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_pd (__m512d __X, const int __M)
+{
+ return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M)
+{
+ return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M)
+{
+ return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+#else
+#define _mm512_permutex_pd(X, M) \
+ ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \
+ (__v8df)(__m512d)_mm512_undefined_pd(),\
+ (__mmask8)-1))
+
+#define _mm512_mask_permutex_pd(W, U, X, M) \
+ ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \
+ (__v8df)(__m512d)(W), (__mmask8)(U)))
+
+#define _mm512_maskz_permutex_pd(U, X, M) \
+ ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \
+ (__v8df)(__m512d)_mm512_setzero_pd(),\
+ (__mmask8)(U)))
+
+#define _mm512_permutex_epi64(X, I) \
+ ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+ (int)(I), \
+ (__v8di)(__m512i) \
+ (_mm512_undefined_epi32 ()),\
+ (__mmask8)(-1)))
+
+#define _mm512_maskz_permutex_epi64(M, X, I) \
+ ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+ (int)(I), \
+ (__v8di)(__m512i) \
+ (_mm512_setzero_si512 ()),\
+ (__mmask8)(M)))
+
+#define _mm512_mask_permutex_epi64(W, M, X, I) \
+ ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+ (int)(I), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(M)))
+#endif
+
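+/* permutexvar: full cross-lane permute of a single source __Y with
+ per-element indices in __X.  Note that the builtin takes
+ (data, index) while the intrinsic takes (index, data), hence the
+ swapped arguments in the calls below.  */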
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+ (__v8di) __X,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+ (__v8di) __X,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+ (__v8di) __X,
+ (__v8di) __W,
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+ (__v16si) __X,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+ (__v16si) __X,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+ (__v16si) __X,
+ (__v16si) __W,
+ __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+ return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+ (__v8di) __X,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+ return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+ (__v8di) __X,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+ return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+ (__v8di) __X,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+ return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+ (__v16si) __X,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+ return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+ (__v16si) __X,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+ return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+ (__v16si) __X,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
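+/* shuffle_ps / shuffle_pd select elements per 128-bit lane from __M
+ and __V under an 8-bit immediate.  fixupimm classifies each element
+ (NaN, Inf, zero, ...) and replaces special values according to the
+ token table in __C; __R carries rounding/exception control.  All of
+ these need compile-time immediates, hence the macro fallbacks.  */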
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm)
+{
+ return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+ (__v16sf) __V, __imm,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M,
+ __m512 __V, const int __imm)
+{
+ return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+ (__v16sf) __V, __imm,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm)
+{
+ return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+ (__v16sf) __V, __imm,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm)
+{
+ return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+ (__v8df) __V, __imm,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M,
+ __m512d __V, const int __imm)
+{
+ return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+ (__v8df) __V, __imm,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V,
+ const int __imm)
+{
+ return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+ (__v8df) __V, __imm,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C,
+ const int __imm, const int __R)
+{
+ return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8di) __C,
+ __imm,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512i __C, const int __imm, const int __R)
+{
+ return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8di) __C,
+ __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512i __C, const int __imm, const int __R)
+{
+ return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8di) __C,
+ __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C,
+ const int __imm, const int __R)
+{
+ return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16si) __C,
+ __imm,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512i __C, const int __imm, const int __R)
+{
+ return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16si) __C,
+ __imm,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512i __C, const int __imm, const int __R)
+{
+ return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16si) __C,
+ __imm,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C,
+ const int __imm, const int __R)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C, __imm,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C, __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C,
+ __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C,
+ const int __imm, const int __R)
+{
+ return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ __m128i __C, const int __imm, const int __R)
+{
+ return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm512_shuffle_pd(X, Y, C) \
+ ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(C),\
+ (__v8df)(__m512d)_mm512_undefined_pd(),\
+ (__mmask8)-1))
+
+#define _mm512_mask_shuffle_pd(W, U, X, Y, C) \
+ ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(C),\
+ (__v8df)(__m512d)(W),\
+ (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_pd(U, X, Y, C) \
+ ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(C),\
+ (__v8df)(__m512d)_mm512_setzero_pd(),\
+ (__mmask8)(U)))
+
+#define _mm512_shuffle_ps(X, Y, C) \
+ ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(C),\
+ (__v16sf)(__m512)_mm512_undefined_ps(),\
+ (__mmask16)-1))
+
+#define _mm512_mask_shuffle_ps(W, U, X, Y, C) \
+ ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(C),\
+ (__v16sf)(__m512)(W),\
+ (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_ps(U, X, Y, C) \
+ ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(C),\
+ (__v16sf)(__m512)_mm512_setzero_ps(),\
+ (__mmask16)(U)))
+
+#define _mm512_fixupimm_round_pd(X, Y, Z, C, R) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \
+ (__mmask8)(-1), (R)))
+
+#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm512_fixupimm_round_ps(X, Y, Z, C, R) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \
+ (__mmask16)(-1), (R)))
+
+#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \
+ (__mmask16)(U), (R)))
+
+#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R) \
+ ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \
+ (__mmask16)(U), (R)))
+
+#define _mm_fixupimm_round_sd(X, Y, Z, C, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) \
+ ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm_fixupimm_round_ss(X, Y, Z, C, R) \
+ ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) \
+ ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) \
+ ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), (R)))
+#endif
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movehdup_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_moveldup_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
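+/* The unmasked bitwise operations are written with GNU vector
+ extensions rather than builtins so that GCC can constant-fold and
+ simplify them; only the masked forms need the pord/porq (and pxord/
+ pxorq, pandd/pandq) builtins.  */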
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_si512 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A | (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A | (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A | (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_si512 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A ^ (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A ^ (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A ^ (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
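+/* Rotates by an immediate count.  As with the permute immediates
+ above, macro fallbacks stand in when __OPTIMIZE__ is not defined.  */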
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi32 (__m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi32 (__m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi64 (__m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi64 (__m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, const int __B)
+{
+ return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+#else
+#define _mm512_rol_epi32(A, B) \
+ ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \
+ (int)(B), \
+ (__v16si)_mm512_undefined_epi32 (), \
+ (__mmask16)(-1)))
+#define _mm512_mask_rol_epi32(W, U, A, B) \
+ ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \
+ (int)(B), \
+ (__v16si)(__m512i)(W), \
+ (__mmask16)(U)))
+#define _mm512_maskz_rol_epi32(U, A, B) \
+ ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \
+ (int)(B), \
+ (__v16si)_mm512_setzero_si512 (), \
+ (__mmask16)(U)))
+#define _mm512_ror_epi32(A, B) \
+ ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \
+ (int)(B), \
+ (__v16si)_mm512_undefined_epi32 (), \
+ (__mmask16)(-1)))
+#define _mm512_mask_ror_epi32(W, U, A, B) \
+ ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \
+ (int)(B), \
+ (__v16si)(__m512i)(W), \
+ (__mmask16)(U)))
+#define _mm512_maskz_ror_epi32(U, A, B) \
+ ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \
+ (int)(B), \
+ (__v16si)_mm512_setzero_si512 (), \
+ (__mmask16)(U)))
+#define _mm512_rol_epi64(A, B) \
+ ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \
+ (int)(B), \
+ (__v8di)_mm512_undefined_epi32 (), \
+ (__mmask8)(-1)))
+#define _mm512_mask_rol_epi64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \
+ (int)(B), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U)))
+#define _mm512_maskz_rol_epi64(U, A, B) \
+ ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \
+ (int)(B), \
+ (__v8di)_mm512_setzero_si512 (), \
+ (__mmask8)(U)))
+
+#define _mm512_ror_epi64(A, B) \
+ ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \
+ (int)(B), \
+ (__v8di)_mm512_undefined_epi32 (), \
+ (__mmask8)(-1)))
+#define _mm512_mask_ror_epi64(W, U, A, B) \
+ ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \
+ (int)(B), \
+ (__v8di)(__m512i)(W), \
+ (__mmask8)(U)))
+#define _mm512_maskz_ror_epi64(U, A, B) \
+ ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \
+ (int)(B), \
+ (__v8di)_mm512_setzero_si512 (), \
+ (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_si512 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A & (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v16su) __A & (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) ((__v8du) __A & (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __U);
+}
+
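+/* vptestm / vptestnm: per-element bitwise tests that produce a mask.
+ test sets bit i when (__A[i] & __B[i]) != 0; testn sets it when the
+ AND is zero.  E.g. _mm512_test_epi32_mask (v, v) yields a mask of
+ the nonzero elements of v.  */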
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi32_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+ (__v16si) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+ (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi64_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+ (__v8di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+ (__v16si) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+ (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+ (__v8di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+ (__v8di) __B, __U);
+}
+
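+/* Floating-point absolute value, implemented by clearing the sign bit
+ with a bitwise AND against 0x7fffffff (or its 64-bit equivalent); no
+ arithmetic instruction is needed.  */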
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_ps (__m512 __A)
+{
+ return (__m512) _mm512_and_epi32 ((__m512i) __A,
+ _mm512_set1_epi32 (0x7fffffff));
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) _mm512_mask_and_epi32 ((__m512i) __W, __U, (__m512i) __A,
+ _mm512_set1_epi32 (0x7fffffff));
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_pd (__m512d __A)
+{
+ return (__m512d) _mm512_and_epi64 ((__m512i) __A,
+ _mm512_set1_epi64 (0x7fffffffffffffffLL));
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d)
+ _mm512_mask_and_epi64 ((__m512i) __W, __U, (__m512i) __A,
+ _mm512_set1_epi64 (0x7fffffffffffffffLL));
+}
+
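+/* unpackhi/unpacklo interleave elements from the high (or low) half
+ of each 128-bit lane of __A and __B, as in SSE/AVX, repeated across
+ all four lanes.  */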
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
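+/* Scalar conversions with explicit rounding control.  __R is a
+ _MM_FROUND_* rounding mode, typically OR'ed with _MM_FROUND_NO_EXC;
+ the cvtt forms always truncate and only honour the
+ exception-suppression bit.  The 64-bit variants exist only on
+ x86-64.  */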
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u64 (__m128 __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u64 (__m128 __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si64 (__m128 __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B))
+
+#define _mm_cvt_roundss_si64(A, B) \
+ ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvt_roundss_i64(A, B) \
+ ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvtt_roundss_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B))
+
+#define _mm_cvtt_roundss_i64(A, B) \
+ ((long long)__builtin_ia32_vcvttss2si64(A, B))
+
+#define _mm_cvtt_roundss_si64(A, B) \
+ ((long long)__builtin_ia32_vcvttss2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u32 (__m128 __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u32 (__m128 __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si32 (__m128 __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvtss2usi32(A, B))
+
+#define _mm_cvt_roundss_si32(A, B) \
+ ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvt_roundss_i32(A, B) \
+ ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvtt_roundss_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvttss2usi32(A, B))
+
+#define _mm_cvtt_roundss_si32(A, B) \
+ ((int)__builtin_ia32_vcvttss2si32(A, B))
+
+#define _mm_cvtt_roundss_i32(A, B) \
+ ((int)__builtin_ia32_vcvttss2si32(A, B))
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u64 (__m128d __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u64 (__m128d __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i64 (__m128d __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B))
+
+#define _mm_cvt_roundsd_si64(A, B) \
+ ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvt_roundsd_i64(A, B) \
+ ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B))
+
+#define _mm_cvtt_roundsd_si64(A, B) \
+ ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_i64(A, B) \
+ ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u32 (__m128d __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u32 (__m128d __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si32 (__m128d __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B))
+
+#define _mm_cvt_roundsd_si32(A, B) \
+ ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvt_roundsd_i32(A, B) \
+ ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B))
+
+#define _mm_cvtt_roundsd_si32(A, B) \
+ ((int)__builtin_ia32_vcvttsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_i32(A, B) \
+ ((int)__builtin_ia32_vcvttsd2si32(A, B))
+#endif
+
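+/* movedup_pd duplicates each even-indexed double into the odd slot
+ above it (the 512-bit extension of SSE3 movddup).  */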
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movedup_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_pd (__m256 __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_ps (__m256i __A, const int __R)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_ph (__m512 __A, const int __I)
+{
+ return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+ __I,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_ph (__m512 __A, const int __I)
+{
+ return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+ __I,
+ (__v16hi)
+ _mm256_undefined_si256 (),
+ -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A,
+ const int __I)
+{
+ return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+ __I,
+ (__v16hi) __U,
+ (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I)
+{
+ return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+ __I,
+ (__v16hi) __U,
+ (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I)
+{
+ return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+ __I,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I)
+{
+ return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+ __I,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __W);
+}
+#else
+#define _mm512_cvt_roundps_pd(A, B) \
+ (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B)
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, B) \
+ (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B)
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, B) \
+ (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B)
+
+#define _mm512_cvt_roundph_ps(A, B) \
+ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundph_ps(W, U, A, B) \
+ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, B) \
+ (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+
+#define _mm512_cvt_roundps_ph(A, I) \
+ ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+ (__v16hi)_mm256_undefined_si256 (), -1))
+#define _mm512_cvtps_ph(A, I) \
+ ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+ (__v16hi)_mm256_undefined_si256 (), -1))
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \
+ ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+ (__v16hi)(__m256i)(U), (__mmask16) (W)))
+#define _mm512_mask_cvtps_ph(U, W, A, I) \
+ ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+ (__v16hi)(__m256i)(U), (__mmask16) (W)))
+#define _mm512_maskz_cvt_roundps_ph(W, A, I) \
+ ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+ (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
+#define _mm512_maskz_cvtps_ph(W, A, I) \
+ ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+ (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
+#endif
+
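+/* Sketch (assumption: the standard _MM_FROUND_* constants from
+   <smmintrin.h>): for the ps->ph conversions the immediate __I carries
+   the rounding control, and _mm512_cvt_roundps_ph is defined
+   identically to _mm512_cvtps_ph.  E.g. round to nearest with
+   exceptions suppressed:
+
+     __m256i __h = _mm512_cvtps_ph (__x, _MM_FROUND_TO_NEAREST_INT
+					 | _MM_FROUND_NO_EXC);  */
+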
+#ifdef __OPTIMIZE__
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_ps (__m512d __A, const int __R)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf)
+ _mm256_undefined_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128d __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A,
+ (__v2df) __B,
+ (__v4sf) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A,
+ __m128d __B, const int __R)
+{
+ return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A,
+ (__v2df) __B,
+ _mm_setzero_ps (),
+ __U,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128 __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A,
+ (__v4sf) __B,
+ (__v2df) __W,
+ __U,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A,
+ __m128 __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A,
+ (__v4sf) __B,
+ _mm_setzero_pd (),
+ __U,
+ __R);
+}
+#else
+#define _mm512_cvt_roundpd_ps(A, B) \
+ (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, B) \
+ (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, B) \
+ (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B)
+
+#define _mm_cvt_roundsd_ss(A, B, C) \
+ (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C)
+
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C))
+
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), \
+ (U), (C))
+
+#define _mm_cvt_roundss_sd(A, B, C) \
+ (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C)
+
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C))
+
+#define _mm_maskz_cvt_roundss_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), \
+ (U), (C))
+
+#endif
+
+#define _mm_mask_cvtss_sd(W, U, A, B) \
+ _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_cvtss_sd(U, A, B) \
+ _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_cvtsd_ss(W, U, A, B) \
+ _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_cvtsd_ss(U, A, B) \
+ _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
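+/* The four wrappers above forward to the *_round_* forms with
+   _MM_FROUND_CUR_DIRECTION, i.e. the conversion uses the current MXCSR
+   rounding mode.  For example, _mm_mask_cvtsd_ss (__w, __u, __a, __b)
+   expands to
+   _mm_mask_cvt_roundsd_ss (__w, __u, __a, __b, _MM_FROUND_CUR_DIRECTION).  */
+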
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_si512 (__m512i * __P, __m512i __A)
+{
+ __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_ps (float *__P, __m512 __A)
+{
+ __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_pd (double *__P, __m512d __A)
+{
+ __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_load_si512 (void *__P)
+{
+ return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
+
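+/* Editorial note (general x86 guidance, not from the original header):
+   the stream stores above are non-temporal; __P must be 64-byte
+   aligned, and an _mm_sfence () (from <xmmintrin.h>) should order the
+   stores before any flag that publishes the data to other threads:
+
+     _mm512_stream_ps (__buf, __v);
+     _mm_sfence ();  */
+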
+/* Constants for mantissa extraction */
+typedef enum
+{
+ _MM_MANT_NORM_1_2, /* interval [1, 2) */
+ _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */
+ _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */
+ _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+ _MM_MANT_SIGN_src, /* sign = sign(SRC) */
+ _MM_MANT_SIGN_zero, /* sign = 0 */
+ _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
+
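+/* The getmant intrinsics below fold the two enums into a single
+   immediate as (sign << 2) | norm.  Sketch: extract mantissas into
+   [0.5, 1) keeping the source sign:
+
+     __m512d __m = _mm512_getmant_round_pd (__x, _MM_MANT_NORM_p5_1,
+					    _MM_MANT_SIGN_src,
+					    _MM_FROUND_CUR_DIRECTION);  */
+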
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_ps (__m512 __A, const int __R)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ const int __R)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_pd (__m512d __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ const int __R)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+ (__C << 2) | __B,
+ _mm512_undefined_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+ (__C << 2) | __B,
+ (__v8df) __W, __U,
+ __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+ (__C << 2) | __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+ (__C << 2) | __B,
+ _mm512_undefined_ps (),
+ (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+ (__C << 2) | __B,
+ (__v16sf) __W, __U,
+ __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+ (__C << 2) | __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_sd (__m128d __A, __m128d __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ (__v2df) __W,
+ __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ (__v2df)
+ _mm_setzero_pd(),
+ __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_ss (__m128 __A, __m128 __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ (__v4sf) __W,
+ __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ (__v4sf)
+ _mm_setzero_ps(),
+ __U, __R);
+}
+
+#else
+#define _mm512_getmant_round_pd(X, B, C, R) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)(__m512d)_mm512_undefined_pd(), \
+ (__mmask8)-1,\
+ (R)))
+
+#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm512_maskz_getmant_round_pd(U, X, B, C, R) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)(__m512d)_mm512_setzero_pd(), \
+ (__mmask8)(U),\
+ (R)))
+#define _mm512_getmant_round_ps(X, B, C, R) \
+ ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)(__m512)_mm512_undefined_ps(), \
+ (__mmask16)-1,\
+ (R)))
+
+#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R) \
+ ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U),\
+ (R)))
+
+#define _mm512_maskz_getmant_round_ps(U, X, B, C, R) \
+ ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)(__m512)_mm512_setzero_ps(), \
+ (__mmask16)(U),\
+ (R)))
+#define _mm_getmant_round_sd(X, Y, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (R)))
+
+#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R) \
+ ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)(__m128d)_mm_setzero_pd(), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_getmant_round_ss(X, Y, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (R)))
+
+#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R) \
+ ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)(__m128)_mm_setzero_ps(), \
+ (__mmask8)(U),\
+ (R)))
+
+#define _mm_getexp_round_ss(A, B, R) \
+ ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R))
+
+#define _mm_mask_getexp_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_getexp_round_sd(A, B, R) \
+ ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R))
+
+#define _mm_mask_getexp_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm512_getexp_round_ps(A, R) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R))
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(W), (__mmask16)(U), R))
+
+#define _mm512_maskz_getexp_round_ps(U, A, R) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R))
+
+#define _mm512_getexp_round_pd(A, R) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R))
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), R))
+
+#define _mm512_maskz_getexp_round_pd(U, A, R) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C,
+ const int __imm, const int __R)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
+ (__v16sf) __A,
+ (__mmask16) __B, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B,
+ const int __imm, const int __R)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
+ __imm,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __A, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
+ (__v8df)
+ _mm512_undefined_pd (),
+ -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B,
+ __m512d __C, const int __imm, const int __R)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
+ (__v8df) __A,
+ (__mmask8) __B, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B,
+ const int __imm, const int __R)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
+ __imm,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __A, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm,
+ const int __R)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_ss (__m128 __A, __mmask8 __B, __m128 __C,
+ __m128 __D, const int __imm, const int __R)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __C,
+ (__v4sf) __D, __imm,
+ (__v4sf) __A,
+ (__mmask8) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_ss (__mmask8 __A, __m128 __B, __m128 __C,
+ const int __imm, const int __R)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __B,
+ (__v4sf) __C, __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __A,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm,
+ const int __R)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_sd (__m128d __A, __mmask8 __B, __m128d __C,
+ __m128d __D, const int __imm, const int __R)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __C,
+ (__v2df) __D, __imm,
+ (__v2df) __A,
+ (__mmask8) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_sd (__mmask8 __A, __m128d __B, __m128d __C,
+ const int __imm, const int __R)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __B,
+ (__v2df) __C, __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __A,
+ __R);
+}
+
+#else
+#define _mm512_roundscale_round_ps(A, B, R) \
+ ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
+ (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), R))
+#define _mm512_mask_roundscale_round_ps(A, B, C, D, R) \
+ ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \
+ (int)(D), \
+ (__v16sf)(__m512)(A), \
+ (__mmask16)(B), R))
+#define _mm512_maskz_roundscale_round_ps(A, B, C, R) \
+ ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \
+ (int)(C), \
+ (__v16sf)_mm512_setzero_ps(),\
+ (__mmask16)(A), R))
+#define _mm512_roundscale_round_pd(A, B, R) \
+ ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
+ (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), R))
+#define _mm512_mask_roundscale_round_pd(A, B, C, D, R) \
+ ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \
+ (int)(D), \
+ (__v8df)(__m512d)(A), \
+ (__mmask8)(B), R))
+#define _mm512_maskz_roundscale_round_pd(A, B, C, R) \
+ ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \
+ (int)(C), \
+ (__v8df)_mm512_setzero_pd(),\
+ (__mmask8)(A), R))
+#define _mm_roundscale_round_ss(A, B, I, R) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \
+ (__v4sf) (__m128) (B), \
+ (int) (I), \
+ (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) (-1), \
+ (int) (R)))
+#define _mm_mask_roundscale_round_ss(A, U, B, C, I, R) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), \
+ (__v4sf) (__m128) (C), \
+ (int) (I), \
+ (__v4sf) (__m128) (A), \
+ (__mmask8) (U), \
+ (int) (R)))
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \
+ (__v4sf) (__m128) (B), \
+ (int) (I), \
+ (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) (U), \
+ (int) (R)))
+#define _mm_roundscale_round_sd(A, B, I, R) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \
+ (__v2df) (__m128d) (B), \
+ (int) (I), \
+ (__v2df) _mm_setzero_pd (), \
+ (__mmask8) (-1), \
+ (int) (R)))
+#define _mm_mask_roundscale_round_sd(A, U, B, C, I, R) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), \
+ (__v2df) (__m128d) (C), \
+ (int) (I), \
+ (__v2df) (__m128d) (A), \
+ (__mmask8) (U), \
+ (int) (R)))
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \
+ (__v2df) (__m128d) (B), \
+ (int) (I), \
+ (__v2df) _mm_setzero_pd (), \
+ (__mmask8) (U), \
+ (int) (R)))
+#endif
+
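+/* Immediate encoding sketch for the roundscale intrinsics above (per
+   the Intel SDM; an aside, not from the original header): bits 7:4 of
+   __imm select how many fraction bits to keep, bits 3:0 give the
+   _MM_FROUND_* rounding control.  _mm512_floor_ps/_mm512_ceil_ps below
+   use zero fraction bits.  E.g. rounding to multiples of 1/16:
+
+     __m512 __r
+       = _mm512_roundscale_round_ps (__x,
+				     (4 << 4) | _MM_FROUND_TO_NEAREST_INT,
+				     _MM_FROUND_CUR_DIRECTION);  */
+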
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_FLOOR,
+ (__v16sf) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_FLOOR,
+ (__v8df) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_CEIL,
+ (__v16sf) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_CEIL,
+ (__v8df) __A, -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_FLOOR,
+ (__v16sf) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_FLOOR,
+ (__v8df) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+ _MM_FROUND_CEIL,
+ (__v16sf) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+ _MM_FROUND_CEIL,
+ (__v8df) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+ (__v16si) __B, __imm,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+ __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+ (__v16si) __B, __imm,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+ (__v16si) __B, __imm,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+ (__v8di) __B, __imm,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+ __m512i __B, const int __imm)
+{
+ return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+ (__v8di) __B, __imm,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+ (__v8di) __B, __imm,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+#else
+#define _mm512_alignr_epi32(X, Y, C) \
+ ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_epi32 (),\
+ (__mmask16)-1))
+
+#define _mm512_mask_alignr_epi32(W, U, X, Y, C) \
+ ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W), \
+ (__mmask16)(U)))
+
+#define _mm512_maskz_alignr_epi32(U, X, Y, C) \
+ ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_setzero_si512 (),\
+ (__mmask16)(U)))
+
+#define _mm512_alignr_epi64(X, Y, C) \
+ ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_epi32 (), \
+ (__mmask8)-1))
+
+#define _mm512_mask_alignr_epi64(W, U, X, Y, C) \
+ ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U)))
+
+#define _mm512_maskz_alignr_epi64(U, X, Y, C) \
+ ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_setzero_si512 (),\
+ (__mmask8)(U)))
+#endif
+
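+/* Semantics reminder for the alignr intrinsics above (VALIGND/VALIGNQ):
+   the concatenation of __A (high) and __B (low) is shifted right by
+   __imm elements and the low 512 bits are kept.  Sketch: rotate the
+   dword elements of a vector by one position:
+
+     __m512i __r = _mm512_alignr_epi32 (__v, __v, 1);  */
+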
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+ (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+ (__v8di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+ (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+ (__v8di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 5,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 5,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 5,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 5,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 2,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 2,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 2,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 2,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 1,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 1,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 1,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 1,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 4,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 4,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 4,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, 4,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, 4,
+ (__mmask8) -1);
+}
+
+#define _MM_CMPINT_EQ 0x0
+#define _MM_CMPINT_LT 0x1
+#define _MM_CMPINT_LE 0x2
+#define _MM_CMPINT_UNUSED 0x3
+#define _MM_CMPINT_NE 0x4
+#define _MM_CMPINT_NLT 0x5
+#define _MM_CMPINT_GE 0x5
+#define _MM_CMPINT_NLE 0x6
+#define _MM_CMPINT_GT 0x6
+
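+/* These predicates feed the generic _mm512_cmp_* intrinsics below; the
+   fixed-predicate helpers above are the same builtins with the
+   predicate hard-coded.  For instance,
+
+     __mmask16 __k = _mm512_cmp_epi32_mask (__x, __y, _MM_CMPINT_LE);
+
+   computes the same mask as _mm512_cmple_epi32_mask (__x, __y).  */
+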
+#ifdef __OPTIMIZE__
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask16 (__mmask16 __A, unsigned int __B)
+{
+ return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask16 (__mmask16 __A, unsigned int __B)
+{
+ return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, __P,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, __P,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P,
+ const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, __P,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, __P,
+ (__mmask16) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, __P,
+ (__mmask16) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+ (__v8di) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+ const int __P)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+ (__v16si) __Y, __P,
+ (__mmask16) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y,
+ const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, __P,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y,
+ const int __P, const int __R)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, __P,
+ (__mmask16) __U, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y,
+ const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) __M, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
+ const int __P, const int __R)
+{
+ return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) __M, __R);
+}
+
+#else
+#define _kshiftli_mask16(X, Y) \
+ ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask16(X, Y) \
+ ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y)))
+
+#define _mm512_cmp_epi64_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm512_cmp_epi32_mask(X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)-1))
+
+#define _mm512_cmp_epu64_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm512_cmp_epu32_mask(X, Y, P) \
+ ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)-1))
+
+#define _mm512_cmp_round_pd_mask(X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(P),\
+ (__mmask8)-1, R))
+
+#define _mm512_cmp_round_ps_mask(X, Y, P, R) \
+ ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(P),\
+ (__mmask16)-1, R))
+
+#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)(M)))
+
+#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \
+ (__v8di)(__m512i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \
+ (__v16si)(__m512i)(Y), (int)(P), \
+ (__mmask16)(M)))
+
+#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(P),\
+ (__mmask8)(M), R))
+
+#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) \
+ ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(P),\
+ (__mmask16)(M), R))
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (__mmask8) (M), R))
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \
+ ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8) (M), R))
+#endif
+
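+/* Illustrative usage sketch: the compare intrinsics return one mask bit
+   per element, and the predicate (and, for the round forms, the rounding
+   argument) must be an integer constant expression -- which is why the
+   macro forms above are used when __OPTIMIZE__ is not defined.
+
+       __m512i __a = _mm512_set1_epi32 (1);
+       __m512i __b = _mm512_set1_epi32 (2);
+       __mmask16 __lt = _mm512_cmp_epi32_mask (__a, __b, _MM_CMPINT_LT);
+
+   Every element of __a compares below __b, so __lt is 0xFFFF.  */
+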
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_ps (__m512i __index, void const *__addr, int __scale)
+{
+ __m512 __v1_old = _mm512_undefined_ps ();
+ __mmask16 __mask = 0xFFFF;
+
+ return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old,
+ __addr,
+ (__v16si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_ps (__m512 __v1_old, __mmask16 __mask,
+ __m512i __index, void const *__addr, int __scale)
+{
+ return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old,
+ __addr,
+ (__v16si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_pd (__m256i __index, void const *__addr, int __scale)
+{
+ __m512d __v1_old = _mm512_undefined_pd ();
+ __mmask8 __mask = 0xFF;
+
+ return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old,
+ __addr,
+ (__v8si) __index, __mask,
+ __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr, int __scale)
+{
+ return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old,
+ __addr,
+ (__v8si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_ps (__m512i __index, void const *__addr, int __scale)
+{
+ __m256 __v1_old = _mm256_undefined_ps ();
+ __mmask8 __mask = 0xFF;
+
+ return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,
+ __addr,
+ (__v8di) __index, __mask,
+ __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask,
+ __m512i __index, void const *__addr, int __scale)
+{
+ return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,
+ __addr,
+ (__v8di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_pd (__m512i __index, void const *__addr, int __scale)
+{
+ __m512d __v1_old = _mm512_undefined_pd ();
+ __mmask8 __mask = 0xFF;
+
+ return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old,
+ __addr,
+ (__v8di) __index, __mask,
+ __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 __mask,
+ __m512i __index, void const *__addr, int __scale)
+{
+ return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old,
+ __addr,
+ (__v8di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_epi32 (__m512i __index, void const *__addr, int __scale)
+{
+ __m512i __v1_old = _mm512_undefined_epi32 ();
+ __mmask16 __mask = 0xFFFF;
+
+ return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old,
+ __addr,
+ (__v16si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask,
+ __m512i __index, void const *__addr, int __scale)
+{
+ return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old,
+ __addr,
+ (__v16si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_epi64 (__m256i __index, void const *__addr, int __scale)
+{
+ __m512i __v1_old = _mm512_undefined_epi32 ();
+ __mmask8 __mask = 0xFF;
+
+ return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old,
+ __addr,
+ (__v8si) __index, __mask,
+ __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr,
+ int __scale)
+{
+ return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old,
+ __addr,
+ (__v8si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_epi32 (__m512i __index, void const *__addr, int __scale)
+{
+ __m256i __v1_old = _mm256_undefined_si256 ();
+ __mmask8 __mask = 0xFF;
+
+ return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old,
+ __addr,
+ (__v8di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask,
+ __m512i __index, void const *__addr, int __scale)
+{
+ return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old,
+ __addr,
+ (__v8di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_epi64 (__m512i __index, void const *__addr, int __scale)
+{
+ __m512i __v1_old = _mm512_undefined_epi32 ();
+ __mmask8 __mask = 0xFF;
+
+ return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old,
+ __addr,
+ (__v8di) __index, __mask,
+ __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask,
+ __m512i __index, void const *__addr,
+ int __scale)
+{
+ return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old,
+ __addr,
+ (__v8di) __index,
+ __mask, __scale);
+}
+
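+/* Illustrative usage sketch for the gathers above: each active element is
+   loaded from __addr + __index[i] * __scale, where __scale must be a
+   constant 1, 2, 4 or 8; in the masked forms, elements whose mask bit is
+   clear keep their value from __v1_old.
+
+       double __tbl[16];
+       __m256i __idx = _mm256_setr_epi32 (0, 2, 4, 6, 8, 10, 12, 14);
+       __m512d __g = _mm512_i32gather_pd (__idx, __tbl, 8);
+
+   gathers the even-indexed elements of __tbl, using scale 8 because the
+   indices count doubles rather than bytes.  */
+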
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_ps (void *__addr, __m512i __index, __m512 __v1, int __scale)
+{
+ __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF,
+ (__v16si) __index, (__v16sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_ps (void *__addr, __mmask16 __mask,
+ __m512i __index, __m512 __v1, int __scale)
+{
+ __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index,
+ (__v16sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_pd (void *__addr, __m256i __index, __m512d __v1,
+ int __scale)
+{
+ __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF,
+ (__v8si) __index, (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+ __m256i __index, __m512d __v1, int __scale)
+{
+ __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index,
+ (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_ps (void *__addr, __m512i __index, __m256 __v1, int __scale)
+{
+ __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF,
+ (__v8di) __index, (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+ __m512i __index, __m256 __v1, int __scale)
+{
+ __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index,
+ (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_pd (void *__addr, __m512i __index, __m512d __v1,
+ int __scale)
+{
+ __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF,
+ (__v8di) __index, (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+ __m512i __index, __m512d __v1, int __scale)
+{
+ __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index,
+ (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_epi32 (void *__addr, __m512i __index,
+ __m512i __v1, int __scale)
+{
+ __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF,
+ (__v16si) __index, (__v16si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_epi32 (void *__addr, __mmask16 __mask,
+ __m512i __index, __m512i __v1, int __scale)
+{
+ __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index,
+ (__v16si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_epi64 (void *__addr, __m256i __index,
+ __m512i __v1, int __scale)
+{
+ __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF,
+ (__v8si) __index, (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+ __m256i __index, __m512i __v1, int __scale)
+{
+ __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index,
+ (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_epi32 (void *__addr, __m512i __index,
+ __m256i __v1, int __scale)
+{
+ __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF,
+ (__v8di) __index, (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+ __m512i __index, __m256i __v1, int __scale)
+{
+ __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index,
+ (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_epi64 (void *__addr, __m512i __index,
+ __m512i __v1, int __scale)
+{
+ __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF,
+ (__v8di) __index, (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+ __m512i __index, __m512i __v1, int __scale)
+{
+ __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index,
+ (__v8di) __v1, __scale);
+}
+#else
+#define _mm512_i32gather_ps(INDEX, ADDR, SCALE) \
+ (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_undefined_ps(),\
+ (void const *) (ADDR), \
+ (__v16si)(__m512i) (INDEX), \
+ (__mmask16)0xFFFF, \
+ (int) (SCALE))
+
+#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v16si)(__m512i) (INDEX), \
+ (__mmask16) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i32gather_pd(INDEX, ADDR, SCALE) \
+ (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_undefined_pd(), \
+ (void const *) (ADDR), \
+ (__v8si)(__m256i) (INDEX), \
+ (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8si)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i64gather_ps(INDEX, ADDR, SCALE) \
+ (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_undefined_ps(), \
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i64gather_pd(INDEX, ADDR, SCALE) \
+ (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_undefined_pd(), \
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE) \
+ (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_epi32 (),\
+ (void const *) (ADDR), \
+ (__v16si)(__m512i) (INDEX), \
+ (__mmask16)0xFFFF, \
+ (int) (SCALE))
+
+#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v16si)(__m512i) (INDEX), \
+ (__mmask16) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE) \
+ (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_epi32 (),\
+ (void const *) (ADDR), \
+ (__v8si)(__m256i) (INDEX), \
+ (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m512i) __builtin_ia32_gathersiv8di ((__v8di)(__m512i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8si)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE) \
+ (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_undefined_si256(),\
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)(__m256i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE) \
+ (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_epi32 (),\
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8di)(__m512i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16)0xFFFF, \
+ (__v16si)(__m512i) (INDEX), \
+ (__v16sf)(__m512) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16) (MASK), \
+ (__v16si)(__m512i) (INDEX), \
+ (__v16sf)(__m512) (V1), (int) (SCALE))
+
+#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16)0xFFFF, \
+ (__v16si)(__m512i) (INDEX), \
+ (__v16si)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16) (MASK), \
+ (__v16si)(__m512i) (INDEX), \
+ (__v16si)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8di)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8di)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8di)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8di)(__m512i) (INDEX), \
+ (__v8di)(__m512i) (V1), (int) (SCALE))
+#endif
+
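+/* Illustrative usage sketch for the scatters: each active element __v1[i]
+   is stored to __addr + __index[i] * __scale.  The stores are performed
+   from the lowest mask bit upwards, so when two indices collide the
+   element from the higher lane is the one that remains.
+
+       float __dst[16];
+       __m512 __v = _mm512_set1_ps (1.0f);
+       __m512i __idx = _mm512_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+                                          10, 11, 12, 13, 14, 15);
+       _mm512_mask_i32scatter_ps (__dst, (__mmask16) 0x00FF, __idx, __v, 4);
+
+   writes 1.0f to __dst[0] through __dst[7] and leaves the rest untouched.  */
+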
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+ __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+ __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+ __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+ __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+ (__mmask16) __U);
+}
+
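+/* Illustrative usage sketch: compress packs the selected elements towards
+   the low end of the destination.
+
+       __m512 __v = _mm512_setr_ps (0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9, 10, 11, 12, 13, 14, 15);
+       __m512 __p = _mm512_maskz_compress_ps ((__mmask16) 0xAAAA, __v);
+
+   moves the eight odd-numbered lanes of __v into lanes 0..7 of __p and
+   zeroes the upper half, while _mm512_mask_compressstoreu_ps stores just
+   the selected elements contiguously to (possibly unaligned) memory.  */
+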
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P)
+{
+ return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m512i)
+ __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+ return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P)
+{
+ return (__m512i)
+ __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P,
+ (__v16si) _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
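+/* Illustrative usage sketch: expand is the inverse of compress --
+   consecutive source elements are placed into the destination lanes whose
+   mask bit is set, from the lowest bit upwards.  With __p pointing at
+   integer data,
+
+       __m512i __r = _mm512_maskz_expandloadu_epi32 ((__mmask16) 0x00F0, __p);
+
+   reads four consecutive ints starting at __p into lanes 4..7 of __r and
+   zeroes all other lanes.  */
+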
+/* Mask arithmetic operations.  */
+#define _kand_mask16 _mm512_kand
+#define _kandn_mask16 _mm512_kandn
+#define _knot_mask16 _mm512_knot
+#define _kor_mask16 _mm512_kor
+#define _kxnor_mask16 _mm512_kxnor
+#define _kxor_mask16 _mm512_kxor
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF)
+{
+ *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B);
+ return (unsigned char) __builtin_ia32_kortestzhi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+ return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask16_u32 (__mmask16 __A)
+{
+ return (unsigned int) __builtin_ia32_kmovw ((__mmask16) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask16 (unsigned int __A)
+{
+ return (__mmask16) __builtin_ia32_kmovw ((__mmask16) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask16 (__mmask16 *__A)
+{
+ return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask16 (__mmask16 *__A, __mmask16 __B)
+{
+ *(__mmask16 *) __A = __builtin_ia32_kmovw (__B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+ return (int) __builtin_ia32_kortestzhi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+ return (int) __builtin_ia32_kortestchi ((__mmask16) __A,
+ (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_knot (__mmask16 __A)
+{
+ return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+ return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kunpackb_mask16 (__mmask8 __A, __mmask8 __B)
+{
+ return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
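+/* Illustrative usage sketch for the 16-bit mask operations above:
+
+       __mmask16 __a = _cvtu32_mask16 (0x0F0F);
+       __mmask16 __b = _cvtu32_mask16 (0xF0F0);
+       unsigned char __cf;
+       unsigned char __zf = _kortest_mask16_u8 (__a, __b, &__cf);
+
+   The OR of __a and __b is 0xFFFF, so __cf is set to 1 (all bits set)
+   and __zf is 0 (the OR is non-zero).  */
+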
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D,
+ const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
+ (__v4si) __D,
+ __imm,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D,
+ const int __imm)
+{
+ return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
+ (__v4sf) __D,
+ __imm,
+ (__v16sf)
+ _mm512_setzero_ps (), __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C,
+ __m128i __D, const int __imm)
+{
+ return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
+ (__v4si) __D,
+ __imm,
+ (__v16si) __A,
+ __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C,
+ __m128 __D, const int __imm)
+{
+ return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
+ (__v4sf) __D,
+ __imm,
+ (__v16sf) __A, __B);
+}
+#else
+#define _mm512_maskz_insertf32x4(A, X, Y, C) \
+ ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \
+ (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(A)))
+
+#define _mm512_maskz_inserti32x4(A, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \
+ (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (), \
+ (__mmask16)(A)))
+
+#define _mm512_mask_insertf32x4(A, B, X, Y, C) \
+ ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \
+ (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A), \
+ (__mmask16)(B)))
+
+#define _mm512_mask_inserti32x4(A, B, X, Y, C) \
+ ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \
+ (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A), \
+ (__mmask16)(B)))
+#endif
+
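+/* Illustrative usage sketch: __imm selects which 128-bit chunk (0..3) of
+   the source vector is replaced and must be a constant; the modified
+   vector is then blended with __A under the writemask.
+
+       __m512 __z = _mm512_setzero_ps ();
+       __m128 __x = _mm_set1_ps (1.0f);
+       __m512 __r = _mm512_mask_insertf32x4 (__z, (__mmask16) -1, __z, __x, 3);
+
+   places 1.0f in lanes 12..15 and leaves the remaining lanes zero.  */
+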
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu64 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_undefined_epi32 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+ (__v8di) __B,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu32 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+ (__v16si) __B,
+ (__v16si) __W, __M);
+}
+
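+/* Illustrative usage sketch: the same bit pattern compares differently in
+   the signed (epi) and unsigned (epu) forms, and the maskz variants zero
+   every inactive lane.
+
+       __m512i __a = _mm512_set1_epi32 (-1);
+       __m512i __b = _mm512_setzero_si512 ();
+       __m512i __s = _mm512_maskz_min_epi32 ((__mmask16) 0x00FF, __a, __b);
+       __m512i __u = _mm512_min_epu32 (__a, __b);
+
+   __s holds -1 in lanes 0..7 and 0 above; __u is all zeros, since as an
+   unsigned value 0xFFFFFFFF is the maximum, not the minimum.  */
+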
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_max_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_maxsd_round(A, B, C)
+
+#define _mm_mask_max_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_max_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_max_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_maxss_round(A, B, C)
+
+#define _mm_mask_max_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_max_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_min_round_sd(A, B, C) \
+ (__m128d)__builtin_ia32_minsd_round(A, B, C)
+
+#define _mm_mask_min_round_sd(W, U, A, B, C) \
+ (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_min_round_sd(U, A, B, C) \
+ (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_min_round_ss(A, B, C) \
+ (__m128)__builtin_ia32_minss_round(A, B, C)
+
+#define _mm_mask_min_round_ss(W, U, A, B, C) \
+ (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_min_round_ss(U, A, B, C) \
+ (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
+
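+/* Illustrative usage sketch: MAX and MIN produce exact results, so the
+   rounding argument of the forms above only controls exception
+   reporting -- pass _MM_FROUND_CUR_DIRECTION, or _MM_FROUND_NO_EXC to
+   suppress all exceptions:
+
+       __m128d __r = _mm_max_round_sd (__a, __b, _MM_FROUND_NO_EXC);
+
+   computes the maximum of the low elements without raising exceptions;
+   the upper element is copied from the first operand.  */
+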
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W)
+{
+ return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W)
+{
+ return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W)
+{
+ return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
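+/* Illustrative usage sketch: blend selects per element between its two
+   vector operands -- a set mask bit picks the second operand, a clear
+   bit the first.
+
+       __m512d __r = _mm512_mask_blend_pd ((__mmask8) 0x0F, __a, __w);
+
+   takes lanes 0..3 from __w and lanes 4..7 from __a.  */
+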
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+ (__v2df) __A,
+ -(__v2df) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+ (__v4sf) __A,
+ -(__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+ -(__v2df) __A,
+ -(__v2df) __B,
+ __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+ -(__v4sf) __A,
+ -(__v4sf) __B,
+ __R);
+}
+#else
+#define _mm_fmadd_round_sd(A, B, C, R) \
+ (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R)
+
+#define _mm_fmadd_round_ss(A, B, C, R) \
+ (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R)
+
+#define _mm_fmsub_round_sd(A, B, C, R) \
+ (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R)
+
+#define _mm_fmsub_round_ss(A, B, C, R) \
+ (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R)
+
+#define _mm_fnmadd_round_sd(A, B, C, R) \
+ (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R)
+
+#define _mm_fnmadd_round_ss(A, B, C, R) \
+ (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R)
+
+#define _mm_fnmsub_round_sd(A, B, C, R) \
+ (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)
+
+#define _mm_fnmsub_round_ss(A, B, C, R) \
+ (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
+#endif
+
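+/* Illustrative usage sketch: the scalar FMA forms operate on the low lane
+   only (the upper lane is copied from the first operand).  With explicit
+   rounding, combine one _MM_FROUND_TO_* mode with _MM_FROUND_NO_EXC, or
+   pass _MM_FROUND_CUR_DIRECTION:
+
+       __m128d __r
+         = _mm_fmadd_round_sd (__w, __a, __b,
+                               _MM_FROUND_TO_NEAREST_INT
+                               | _MM_FROUND_NO_EXC);
+
+   computes __w * __a + __b in the low lane, rounded to nearest with all
+   exceptions suppressed.  */
+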
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ (__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ (__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ (__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ -(__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ -(__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ -(__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
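+/* Illustrative usage sketch: the three masked variants above differ only
+   in what the low result lane falls back to when bit 0 of the mask is
+   clear -- the mask form keeps the first operand, the maskz form yields
+   0.0, and the mask3 form keeps the addend.  For example
+
+       __m128d __r = _mm_mask_fmadd_sd (__w, (__mmask8) 0, __a, __b);
+
+   simply returns __w, both lanes unchanged.  */
+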
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ (__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ (__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ (__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ (__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+ -(__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+ -(__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+ -(__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+ -(__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+ const int __R)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+ -(__v2df) __A,
+ -(__v2df) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+ const int __R)
+{
+ return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+ -(__v4sf) __A,
+ -(__v4sf) __B,
+ (__mmask8) __U, __R);
+}
+#else
+#define _mm_mask_fmadd_round_sd(A, U, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, C, U, R)
+
+#define _mm_mask_fmadd_round_ss(A, U, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_mask (A, B, C, U, R)
+
+#define _mm_mask3_fmadd_round_sd(A, B, C, U, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, B, C, U, R)
+
+#define _mm_mask3_fmadd_round_ss(A, B, C, U, R) \
+ (__m128) __builtin_ia32_vfmaddss3_mask3 (A, B, C, U, R)
+
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, C, U, R)
+
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, C, U, R)
+
+#define _mm_mask_fmsub_round_sd(A, U, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, -(C), U, R)
+
+#define _mm_mask_fmsub_round_ss(A, U, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_mask (A, B, -(C), U, R)
+
+#define _mm_mask3_fmsub_round_sd(A, B, C, U, R) \
+ (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, B, C, U, R)
+
+#define _mm_mask3_fmsub_round_ss(A, B, C, U, R) \
+ (__m128) __builtin_ia32_vfmsubss3_mask3 (A, B, C, U, R)
+
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, -(C), U, R)
+
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, -(C), U, R)
+
+#define _mm_mask_fnmadd_round_sd(A, U, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), C, U, R)
+
+#define _mm_mask_fnmadd_round_ss(A, U, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), C, U, R)
+
+#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, -(B), C, U, R)
+
+#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R) \
+ (__m128) __builtin_ia32_vfmaddss3_mask3 (A, -(B), C, U, R)
+
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), C, U, R)
+
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), C, U, R)
+
+#define _mm_mask_fnmsub_round_sd(A, U, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), -(C), U, R)
+
+#define _mm_mask_fnmsub_round_ss(A, U, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), -(C), U, R)
+
+#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R) \
+ (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, -(B), C, U, R)
+
+#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R) \
+ (__m128) __builtin_ia32_vfmsubss3_mask3 (A, -(B), C, U, R)
+
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \
+ (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), -(C), U, R)
+
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \
+ (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), -(C), U, R)
+#endif
+
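+/* _mm_comi_round_ss/sd compare the low elements of __A and __B under the
+   comparison predicate __P with exception-suppression control __R; both
+   must be compile-time constants, e.g.
+   _mm_comi_round_ss (a, b, _CMP_GE_OQ, _MM_FROUND_NO_EXC).  */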
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R)
+{
+ return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R)
+{
+ return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R);
+}
+#else
+#define _mm_comi_round_ss(A, B, C, D) \
+  __builtin_ia32_vcomiss (A, B, C, D)
+
+#define _mm_comi_round_sd(A, B, C, D) \
+  __builtin_ia32_vcomisd (A, B, C, D)
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
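+/* The unmasked element-wise add/sub/mul/div operations below are written
+   as GNU C vector arithmetic rather than builtins; the _mask and _maskz
+   forms use builtins so the mask and rounding argument can be carried
+   through.  */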
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) ((__v8df)__A + (__v8df)__B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) ((__v16sf)__A + (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) ((__v8df)__A - (__v8df)__B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) ((__v16sf)__A - (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) ((__v8df)__A * (__v8df)__B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) ((__v16sf)__A * (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B)
+{
+ return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_pd (__m512d __M, __m512d __V)
+{
+ return (__m512d) ((__v8df)__M / (__v8df)__V);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
+{
+ return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+ (__v8df) __V,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V)
+{
+ return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+ (__v8df) __V,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) ((__v16sf)__A / (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128 __B)
+{
+ return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
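+/* vscalef computes __A * 2^floor(__B) element-wise
+   (VSCALEFPD/VSCALEFPS and the scalar VSCALEFSD/VSCALEFSS forms).  */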
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+ return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+ return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ -(__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ -(__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+ return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8df) __C,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+ return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16sf) __C,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
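+/* Conversions: the cvtt* forms truncate toward zero; the plain cvt*
+   forms round according to the current rounding mode
+   (_MM_FROUND_CUR_DIRECTION).  */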
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epi32 (__m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epi32 (__m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epu32 (__m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_undefined_si256 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epi32 (__m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epu32 (__m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epi32 (__m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epu32 (__m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_undefined_epi32 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+ return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsd_f64 (__m512d __A)
+{
+ return __A[0];
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtss_f32 (__m512 __A)
+{
+ return __A[0];
+}
+
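+/* The unsigned 64-bit conversions use the REX.W-encoded
+   VCVTUSI2SS/VCVTUSI2SD forms and are therefore only available in 64-bit
+   mode.  */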
+#ifdef __x86_64__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+ return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+ return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+ return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_ps (__m512i __A)
+{
+ return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+ return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ps (__m512i __A)
+{
+ return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+ return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
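+/* The fixupimm builtins require a compile-time constant immediate.
+   The always-inline wrappers below rely on optimization to fold __imm
+   into a constant, so equivalent macros are used when __OPTIMIZE__ is
+   not defined.  */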
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm)
+{
+ return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8di) __C,
+ __imm,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+ (__v8df) __B,
+ (__v8di) __C,
+ __imm,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+ (__v8df) __B,
+ (__v8di) __C,
+ __imm,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm)
+{
+ return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16si) __C,
+ __imm,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16si) __C,
+ __imm,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B,
+ __m512i __C, const int __imm)
+{
+ return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+ (__v16sf) __B,
+ (__v16si) __C,
+ __imm,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C, __imm,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C, __imm,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C,
+ __imm,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm)
+{
+ return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C, __imm,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#else
+#define _mm512_fixupimm_pd(X, Y, Z, C) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \
+ (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C) \
+ ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_fixupimm_ps(X, Y, Z, C) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \
+ (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C) \
+ ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \
+ (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C) \
+ ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \
+ (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_sd(X, Y, Z, C) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_sd(X, U, Y, Z, C) \
+ ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C) \
+ ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_ss(X, Y, Z, C) \
+ ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_ss(X, U, Y, Z, C) \
+ ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C) \
+ ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+#endif
+
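+/* Scalar conversions with 64-bit integer operands are only available
+   in 64-bit mode.  */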
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_u64 (__m128 __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_u64 (__m128 __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_i64 (__m128 __A)
+{
+ return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+#endif /* __x86_64__ */
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsi512_si32 (__m512i __A)
+{
+ __v16si __B = (__v16si) __A;
+ return __B[0];
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_u32 (__m128 __A)
+{
+ return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_u32 (__m128 __A)
+{
+ return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_i32 (__m128 __A)
+{
+ return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_i32 (__m128d __A)
+{
+ return (int) __builtin_ia32_cvtsd2si ((__v2df) __A);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_i32 (__m128 __A)
+{
+ return (int) __builtin_ia32_cvtss2si ((__v4sf) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti32_sd (__m128d __A, int __B)
+{
+ return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti32_ss (__m128 __A, int __B)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_u64 (__m128d __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_u64 (__m128d __A)
+{
+ return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_i64 (__m128d __A)
+{
+ return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_i64 (__m128d __A)
+{
+ return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_i64 (__m128 __A)
+{
+ return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti64_sd (__m128d __A, long long __B)
+{
+ return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti64_ss (__m128 __A, long long __B)
+{
+ return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+#endif /* __x86_64__ */
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_u32 (__m128d __A)
+{
+ return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_u32 (__m128d __A)
+{
+ return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_i32 (__m128d __A)
+{
+ return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_pd (__m256 __A)
+{
+ return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+ return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+ return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
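+/* vcvtph2ps widens IEEE half-precision values to single precision.  */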
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_ps (__m256i __A)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+ return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_ps (__m512d __A)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf)
+ _mm256_undefined_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+ return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
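+/* vgetexp extracts the exponent of each element, returning
+   floor (log2 (|x|)) as a floating-point value.  */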
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_ps (__m512 __A)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf) __W,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+ return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_pd (__m512d __A)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+ return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
+ (__v4sf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
+ (__v2df) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
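+/* vgetmant extracts the normalized mantissa of each element.  The
+   immediate packs the normalization interval (__B) into bits 1:0 and
+   the sign control (__C) into bits 3:2, hence (__C << 2) | __B.  */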
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+ (__C << 2) | __B,
+ _mm512_undefined_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A,
+ _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+ (__C << 2) | __B,
+ (__v8df) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_pd (__mmask8 __U, __m512d __A,
+ _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+ (__C << 2) | __B,
+ (__v8df)
+ _mm512_setzero_pd (),
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+ (__C << 2) | __B,
+ _mm512_undefined_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A,
+ _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+ (__C << 2) | __B,
+ (__v16sf) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A,
+ _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+ (__C << 2) | __B,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+ _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ (__v2df) __W,
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B,
+ _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+ (__v2df) __B,
+ (__D << 2) | __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ (__v4sf) __W,
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B,
+ _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+ (__v4sf) __B,
+ (__D << 2) | __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_getmant_pd(X, B, C) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)_mm512_undefined_pd(), \
+ (__mmask8)-1,\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_pd(W, U, X, B, C) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)(__m512d)(W), \
+ (__mmask8)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_pd(U, X, B, C) \
+ ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8df)_mm512_setzero_pd(), \
+ (__mmask8)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm512_getmant_ps(X, B, C) \
+ ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)_mm512_undefined_ps(), \
+ (__mmask16)-1,\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ps(W, U, X, B, C) \
+ ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)(__m512)(W), \
+ (__mmask16)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_ps(U, X, B, C) \
+ ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16sf)_mm512_setzero_ps(), \
+ (__mmask16)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm_getmant_sd(X, Y, C, D) \
+ ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sd(W, U, X, Y, C, D) \
+ ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_sd(U, X, Y, C, D) \
+ ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v2df)_mm_setzero_pd(), \
+ (__mmask8)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getmant_ss(X, Y, C, D) \
+ ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_ss(W, U, X, Y, C, D) \
+ ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_ss(U, X, Y, C, D) \
+ ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v4sf)_mm_setzero_ps(), \
+ (__mmask8)(U),\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getexp_ss(A, B) \
+ ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getexp_ss(W, U, A, B) \
+  ((__m128)__builtin_ia32_getexpss_mask_round((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (__v4sf)(__m128)(W), (__mmask8)(U), \
+    _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getexp_ss(U, A, B) \
+  ((__m128)__builtin_ia32_getexpss_mask_round((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U), \
+    _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getexp_sd(A, B) \
+ ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B),\
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getexp_sd(W, U, A, B) \
+  ((__m128d)__builtin_ia32_getexpsd_mask_round((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (__v2df)(__m128d)(W), (__mmask8)(U), \
+    _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getexp_sd(U, A, B) \
+  ((__m128d)__builtin_ia32_getexpsd_mask_round((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U), \
+    _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getexp_ps(A) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getexp_ps(W, U, A) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getexp_ps(U, A) \
+ ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \
+ (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getexp_pd(A) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getexp_pd(W, U, A) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getexp_pd(U, A) \
+ ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \
+ (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+#endif
+
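+/* vrndscale rounds each element to a fixed number of fraction bits:
+   imm8[7:4] gives the bit count, the low nibble selects the rounding
+   behavior.  */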
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_ps (__m512 __A, const int __imm)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
+ (__v16sf)
+ _mm512_undefined_ps (),
+ -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C,
+ const int __imm)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
+ (__v16sf) __A,
+ (__mmask16) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm)
+{
+ return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
+ __imm,
+ (__v16sf)
+ _mm512_setzero_ps (),
+ (__mmask16) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_pd (__m512d __A, const int __imm)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
+ (__v8df)
+ _mm512_undefined_pd (),
+ -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C,
+ const int __imm)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
+ (__v8df) __A,
+ (__mmask8) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm)
+{
+ return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
+ __imm,
+ (__v8df)
+ _mm512_setzero_pd (),
+ (__mmask8) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __A,
+ (__v4sf) __B, __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128 __D,
+ const int __imm)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __C,
+ (__v4sf) __D, __imm,
+ (__v4sf) __A,
+ (__mmask8) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_ss (__mmask8 __A, __m128 __B, __m128 __C,
+ const int __imm)
+{
+ return (__m128)
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) __B,
+ (__v4sf) __C, __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __A,
+ (__v2df) __B, __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128d __D,
+ const int __imm)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __C,
+ (__v2df) __D, __imm,
+ (__v2df) __A,
+ (__mmask8) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_sd (__mmask8 __A, __m128d __B, __m128d __C,
+ const int __imm)
+{
+ return (__m128d)
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) __B,
+ (__v2df) __C, __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_roundscale_ps(A, B) \
+ ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
+ (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_mask_roundscale_ps(A, B, C, D) \
+ ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \
+ (int)(D), \
+ (__v16sf)(__m512)(A), \
+ (__mmask16)(B), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_maskz_roundscale_ps(A, B, C) \
+ ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \
+ (int)(C), \
+ (__v16sf)_mm512_setzero_ps(),\
+ (__mmask16)(A), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_roundscale_pd(A, B) \
+ ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
+ (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_mask_roundscale_pd(A, B, C, D) \
+ ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \
+ (int)(D), \
+ (__v8df)(__m512d)(A), \
+ (__mmask8)(B), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_maskz_roundscale_pd(A, B, C) \
+ ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \
+ (int)(C), \
+ (__v8df)_mm512_setzero_pd(),\
+ (__mmask8)(A), _MM_FROUND_CUR_DIRECTION))
+#define _mm_roundscale_ss(A, B, I) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \
+ (__v4sf) (__m128) (B), \
+ (int) (I), \
+ (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) (-1), \
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm_mask_roundscale_ss(A, U, B, C, I) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), \
+ (__v4sf) (__m128) (C), \
+ (int) (I), \
+ (__v4sf) (__m128) (A), \
+ (__mmask8) (U), \
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm_maskz_roundscale_ss(U, A, B, I) \
+ ((__m128) \
+ __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \
+ (__v4sf) (__m128) (B), \
+ (int) (I), \
+ (__v4sf) _mm_setzero_ps (), \
+ (__mmask8) (U), \
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm_roundscale_sd(A, B, I) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \
+ (__v2df) (__m128d) (B), \
+ (int) (I), \
+ (__v2df) _mm_setzero_pd (), \
+ (__mmask8) (-1), \
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm_mask_roundscale_sd(A, U, B, C, I) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), \
+ (__v2df) (__m128d) (C), \
+ (int) (I), \
+ (__v2df) (__m128d) (A), \
+ (__mmask8) (U), \
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm_maskz_roundscale_sd(U, A, B, I) \
+ ((__m128d) \
+ __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \
+ (__v2df) (__m128d) (B), \
+ (int) (I), \
+ (__v2df) _mm_setzero_pd (), \
+ (__mmask8) (U), \
+ _MM_FROUND_CUR_DIRECTION))
+#endif
+
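+/* Compare into a mask register.  __P selects one of the 32 _CMP_*
+   predicates and must be a compile-time constant, hence the macro
+   forms below for builds without __OPTIMIZE__.  */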
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, __P,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, __P,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, __P,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, __P,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) __M,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) __M,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_cmp_pd_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(P),\
+ (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_cmp_ps_mask(X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(P),\
+ (__mmask16)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_cmp_pd_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \
+ (__v8df)(__m512d)(Y), (int)(P),\
+ (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_cmp_ps_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \
+ (__v16sf)(__m512)(Y), (int)(P),\
+ (__mmask16)(M),_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_cmp_sd_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_sd_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_cmp_ss_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_ss_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P), \
+ (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))
+#endif
+
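+/* Convenience wrappers that fix the comparison predicate to the
+   common _CMP_* encodings.  */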
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_EQ_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_EQ_OQ,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_LT_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_LT_OS,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_LE_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_LE_OS,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpunord_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_UNORD_Q,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpunord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_UNORD_Q,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_NEQ_UQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_NEQ_UQ,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnlt_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_NLT_US,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnlt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_NLT_US,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnle_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_NLE_US,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnle_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_NLE_US,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpord_pd_mask (__m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_ORD_Q,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+ return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+ (__v8df) __Y, _CMP_ORD_Q,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_EQ_OQ,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_EQ_OQ,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_LT_OS,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_LT_OS,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_LE_OS,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_LE_OS,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpunord_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_UNORD_Q,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpunord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_UNORD_Q,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_NEQ_UQ,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_NEQ_UQ,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnlt_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_NLT_US,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnlt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_NLT_US,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnle_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_NLE_US,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnle_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_NLE_US,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpord_ps_mask (__m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_ORD_Q,
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+ (__v16sf) __Y, _CMP_ORD_Q,
+ (__mmask16) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
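+/* Copy a 16-bit mask register (kmovw).  */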
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kmov (__mmask16 __A)
+{
+ return __builtin_ia32_kmovw (__A);
+}
+
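+/* Bit-casts between 512-bit vector types; these generate no code.  */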
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_ps (__m512d __A)
+{
+ return (__m512) (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_si512 (__m512d __A)
+{
+ return (__m512i) (__A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_pd (__m512 __A)
+{
+ return (__m512d) (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_si512 (__m512 __A)
+{
+ return (__m512i) (__A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_ps (__m512i __A)
+{
+ return (__m512) (__A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_pd (__m512i __A)
+{
+ return (__m512d) (__A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd512_pd128 (__m512d __A)
+{
+ return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps512_ps128 (__m512 __A)
+{
+ return _mm512_extractf32x4_ps(__A, 0);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_si128 (__m512i __A)
+{
+ return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd512_pd256 (__m512d __A)
+{
+ return _mm512_extractf64x4_pd(__A, 0);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps512_ps256 (__m512 __A)
+{
+ return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_si256 (__m512i __A)
+{
+ return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd128_pd512 (__m128d __A)
+{
+ return (__m512d) __builtin_ia32_pd512_pd((__m128d)__A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps128_ps512 (__m128 __A)
+{
+ return (__m512) __builtin_ia32_ps512_ps((__m128)__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi128_si512 (__m128i __A)
+{
+ return (__m512i) __builtin_ia32_si512_si((__v4si)__A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd256_pd512 (__m256d __A)
+{
+ return __builtin_ia32_pd512_256pd (__A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps256_ps512 (__m256 __A)
+{
+ return __builtin_ia32_ps512_256ps (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi256_si512 (__m256i __A)
+{
+ return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A);
+}
+
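+/* Unlike the plain casts from 128/256 bits, whose upper elements are
+   undefined, the zext variants zero the upper part of the result.  */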
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpd128_pd512 (__m128d __A)
+{
+ return (__m512d) _mm512_insertf32x4 (_mm512_setzero_ps (), (__m128) __A, 0);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextps128_ps512 (__m128 __A)
+{
+ return _mm512_insertf32x4 (_mm512_setzero_ps (), __A, 0);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextsi128_si512 (__m128i __A)
+{
+ return _mm512_inserti32x4 (_mm512_setzero_si512 (), __A, 0);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpd256_pd512 (__m256d __A)
+{
+ return _mm512_insertf64x4 (_mm512_setzero_pd (), __A, 0);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextps256_ps512 (__m256 __A)
+{
+ return (__m512) _mm512_insertf64x4 (_mm512_setzero_pd (), (__m256d) __A, 0);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextsi256_si512 (__m256i __A)
+{
+ return _mm512_inserti64x4 (_mm512_setzero_si512 (), __A, 0);
+}
+
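+/* Unsigned integer comparisons.  The ucmp builtins take an integer
+   predicate: 0 is equal, 6 (not-less-or-equal) is greater-than.  */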
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+ (__v16si) __B, 0,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+ (__v16si) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+ (__v8di) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu64_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+ (__v8di) __B, 0,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu32_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+ (__v16si) __B, 6,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+ (__v16si) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+ (__v8di) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu64_mask (__m512i __A, __m512i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+ (__v8di) __B, 6,
+ (__mmask8) -1);
+}
+
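+/* Tree reduction over the 16 int elements: extract and combine the
+   two 256-bit halves, then the two 128-bit halves, then shuffle and
+   combine within 128 bits, applying op at every step.  */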
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); \
+ __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); \
+ __m256i __T3 = (__m256i) (__T1 op __T2); \
+ __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); \
+ __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); \
+ __v4si __T6 = __T4 op __T5; \
+ __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
+ __v4si __T8 = __T6 op __T7; \
+ return __T8[0] op __T8[1]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (|);
+}
+
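+/* The masked forms first replace masked-off elements with the
+   identity of the operation (0 for add/or, 1 for mul, ~0 for and)
+   so they cannot affect the result.  */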
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_maskz_mov_epi32 (__U, __A);
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A);
+ __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+ __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_maskz_mov_epi32 (__U, __A);
+ __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); \
+ __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); \
+ __m256i __T3 = _mm256_##op (__T1, __T2); \
+ __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); \
+ __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); \
+ __m128i __T6 = _mm_##op (__T4, __T5); \
+ __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, \
+ (__v4si) { 2, 3, 0, 1 }); \
+ __m128i __T8 = _mm_##op (__T6, __T7); \
+ __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, \
+ (__v4si) { 1, 0, 1, 0 }); \
+ __v4si __T10 = (__v4si) _mm_##op (__T8, __T9); \
+ return __T10[0]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu32 (__m512i __A)
+{
+ __MM512_REDUCE_OP (max_epu32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A);
+ __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A);
+ __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+ __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A)
+{
+ __A = _mm512_maskz_mov_epi32 (__U, __A);
+ __MM512_REDUCE_OP (max_epu32);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+ __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+ __m256 __T3 = __T1 op __T2; \
+ __m128 __T4 = _mm256_extractf128_ps (__T3, 1); \
+ __m128 __T5 = _mm256_extractf128_ps (__T3, 0); \
+ __m128 __T6 = __T4 op __T5; \
+ __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
+ __m128 __T8 = __T6 op __T7; \
+ return __T8[0] op __T8[1]
+
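+/* Two GCC-specific details in the macro above: __builtin_shuffle
+ takes an integer index vector with the same lane count as the data,
+ which is why the float __m128 values are shuffled with a __v4si
+ mask; and because the combining step uses the generic vector
+ operators, the add/mul reductions are tree reductions whose
+ floating-point rounding may differ from a strict left-to-right sum
+ of the 16 lanes. */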
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_ps (__m512 __A)
+{
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_ps (__m512 __A)
+{
+ __MM512_REDUCE_OP (*);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A)
+{
+ __A = _mm512_maskz_mov_ps (__U, __A);
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A)
+{
+ __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A);
+ __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+ __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+ __m256 __T3 = _mm256_##op (__T1, __T2); \
+ __m128 __T4 = _mm256_extractf128_ps (__T3, 1); \
+ __m128 __T5 = _mm256_extractf128_ps (__T3, 0); \
+ __m128 __T6 = _mm_##op (__T4, __T5); \
+ __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \
+ __m128 __T8 = _mm_##op (__T6, __T7); \
+ __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); \
+ __m128 __T10 = _mm_##op (__T8, __T9); \
+ return __T10[0]
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_ps (__m512 __A)
+{
+ __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_ps (__m512 __A)
+{
+ __MM512_REDUCE_OP (max_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A)
+{
+ __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A);
+ __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A)
+{
+ __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A);
+ __MM512_REDUCE_OP (max_ps);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); \
+ __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0); \
+ __m256i __T3 = (__m256i) (__T1 op __T2); \
+ __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); \
+ __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); \
+ __v2di __T6 = __T4 op __T5; \
+ return __T6[0] op __T6[1]
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (|);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_maskz_mov_epi64 (__U, __A);
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A);
+ __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+ __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_maskz_mov_epi64 (__U, __A);
+ __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); \
+ __m512i __T2 = _mm512_##op (__A, __T1); \
+ __m512i __T3 \
+ = (__m512i) __builtin_shuffle ((__v8di) __T2, \
+ (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 });\
+ __m512i __T4 = _mm512_##op (__T2, __T3); \
+ __m512i __T5 \
+ = (__m512i) __builtin_shuffle ((__v8di) __T4, \
+ (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });\
+ __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5); \
+ return __T6[0]
+
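+/* Unlike the reductions above, this min/max variant stays at full
+ 512-bit width: AVX-512F provides vpmin/vpmaxq only as 512-bit
+ operations (the 128- and 256-bit forms require AVX512VL), so the
+ tree is built from _mm512_shuffle_i64x2 and 8-lane shuffles instead
+ of narrowing extracts. */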
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__),
+ __U, __A);
+ __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1),
+ __U, __A);
+ __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu64 (__m512i __A)
+{
+ __MM512_REDUCE_OP (max_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+ __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A)
+{
+ __A = _mm512_maskz_mov_epi64 (__U, __A);
+ __MM512_REDUCE_OP (max_epu64);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); \
+ __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); \
+ __m256d __T3 = __T1 op __T2; \
+ __m128d __T4 = _mm256_extractf128_pd (__T3, 1); \
+ __m128d __T5 = _mm256_extractf128_pd (__T3, 0); \
+ __m128d __T6 = __T4 op __T5; \
+ return __T6[0] op __T6[1]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_pd (__m512d __A)
+{
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_pd (__m512d __A)
+{
+ __MM512_REDUCE_OP (*);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A)
+{
+ __A = _mm512_maskz_mov_pd (__U, __A);
+ __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A)
+{
+ __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A);
+ __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+ __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); \
+ __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); \
+ __m256d __T3 = _mm256_##op (__T1, __T2); \
+ __m128d __T4 = _mm256_extractf128_pd (__T3, 1); \
+ __m128d __T5 = _mm256_extractf128_pd (__T3, 0); \
+ __m128d __T6 = _mm_##op (__T4, __T5); \
+ __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); \
+ __m128d __T8 = _mm_##op (__T6, __T7); \
+ return __T8[0]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_pd (__m512d __A)
+{
+ __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_pd (__m512d __A)
+{
+ __MM512_REDUCE_OP (max_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A)
+{
+ __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A);
+ __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A)
+{
+ __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A);
+ __MM512_REDUCE_OP (max_pd);
+}
+
+#undef __MM512_REDUCE_OP
+
+#ifdef __DISABLE_AVX512F__
+#undef __DISABLE_AVX512F__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512F__ */
+
+#endif /* _AVX512FINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16INTRIN_H_INCLUDED
+#define __AVX512FP16INTRIN_H_INCLUDED
+
+#ifndef __AVX512FP16__
+#pragma GCC push_options
+#pragma GCC target("avx512fp16")
+#define __DISABLE_AVX512FP16__
+#endif /* __AVX512FP16__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+typedef _Float16 __v16hf __attribute__ ((__vector_size__ (32)));
+typedef _Float16 __v32hf __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* Unaligned version of the same type. */
+typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16), \
+ __may_alias__, __aligned__ (1)));
+typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), \
+ __may_alias__, __aligned__ (1)));
+typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), \
+ __may_alias__, __aligned__ (1)));
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
+ _Float16 __A4, _Float16 __A3, _Float16 __A2,
+ _Float16 __A1, _Float16 __A0)
+{
+ return __extension__ (__m128h)(__v8hf){ __A0, __A1, __A2, __A3,
+ __A4, __A5, __A6, __A7 };
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_ph (_Float16 __A15, _Float16 __A14, _Float16 __A13,
+ _Float16 __A12, _Float16 __A11, _Float16 __A10,
+ _Float16 __A9, _Float16 __A8, _Float16 __A7,
+ _Float16 __A6, _Float16 __A5, _Float16 __A4,
+ _Float16 __A3, _Float16 __A2, _Float16 __A1,
+ _Float16 __A0)
+{
+ return __extension__ (__m256h)(__v16hf){ __A0, __A1, __A2, __A3,
+ __A4, __A5, __A6, __A7,
+ __A8, __A9, __A10, __A11,
+ __A12, __A13, __A14, __A15 };
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_ph (_Float16 __A31, _Float16 __A30, _Float16 __A29,
+ _Float16 __A28, _Float16 __A27, _Float16 __A26,
+ _Float16 __A25, _Float16 __A24, _Float16 __A23,
+ _Float16 __A22, _Float16 __A21, _Float16 __A20,
+ _Float16 __A19, _Float16 __A18, _Float16 __A17,
+ _Float16 __A16, _Float16 __A15, _Float16 __A14,
+ _Float16 __A13, _Float16 __A12, _Float16 __A11,
+ _Float16 __A10, _Float16 __A9, _Float16 __A8,
+ _Float16 __A7, _Float16 __A6, _Float16 __A5,
+ _Float16 __A4, _Float16 __A3, _Float16 __A2,
+ _Float16 __A1, _Float16 __A0)
+{
+ return __extension__ (__m512h)(__v32hf){ __A0, __A1, __A2, __A3,
+ __A4, __A5, __A6, __A7,
+ __A8, __A9, __A10, __A11,
+ __A12, __A13, __A14, __A15,
+ __A16, __A17, __A18, __A19,
+ __A20, __A21, __A22, __A23,
+ __A24, __A25, __A26, __A27,
+ __A28, __A29, __A30, __A31 };
+}
+
+/* Create vectors with elements in reversed order relative to the
+ _mm_set_ph, _mm256_set_ph and _mm512_set_ph functions. */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+ _Float16 __A3, _Float16 __A4, _Float16 __A5,
+ _Float16 __A6, _Float16 __A7)
+{
+ return _mm_set_ph (__A7, __A6, __A5, __A4, __A3, __A2, __A1, __A0);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+ _Float16 __A3, _Float16 __A4, _Float16 __A5,
+ _Float16 __A6, _Float16 __A7, _Float16 __A8,
+ _Float16 __A9, _Float16 __A10, _Float16 __A11,
+ _Float16 __A12, _Float16 __A13, _Float16 __A14,
+ _Float16 __A15)
+{
+ return _mm256_set_ph (__A15, __A14, __A13, __A12, __A11, __A10, __A9,
+ __A8, __A7, __A6, __A5, __A4, __A3, __A2, __A1,
+ __A0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+ _Float16 __A3, _Float16 __A4, _Float16 __A5,
+ _Float16 __A6, _Float16 __A7, _Float16 __A8,
+ _Float16 __A9, _Float16 __A10, _Float16 __A11,
+ _Float16 __A12, _Float16 __A13, _Float16 __A14,
+ _Float16 __A15, _Float16 __A16, _Float16 __A17,
+ _Float16 __A18, _Float16 __A19, _Float16 __A20,
+ _Float16 __A21, _Float16 __A22, _Float16 __A23,
+ _Float16 __A24, _Float16 __A25, _Float16 __A26,
+ _Float16 __A27, _Float16 __A28, _Float16 __A29,
+ _Float16 __A30, _Float16 __A31)
+{
+ return _mm512_set_ph (__A31, __A30, __A29, __A28, __A27, __A26, __A25,
+ __A24, __A23, __A22, __A21, __A20, __A19, __A18,
+ __A17, __A16, __A15, __A14, __A13, __A12, __A11,
+ __A10, __A9, __A8, __A7, __A6, __A5, __A4, __A3,
+ __A2, __A1, __A0);
+}
+
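+/* A short ordering sketch (values are illustrative only): with
+
+ __m128h __a = _mm_set_ph (7.0f16, 6.0f16, 5.0f16, 4.0f16,
+ 3.0f16, 2.0f16, 1.0f16, 0.0f16);
+ __m128h __b = _mm_setr_ph (0.0f16, 1.0f16, 2.0f16, 3.0f16,
+ 4.0f16, 5.0f16, 6.0f16, 7.0f16);
+
+ both vectors hold 0.0 in lane 0 and 7.0 in lane 7: _mm_set_ph lists
+ elements from the highest lane down, _mm_setr_ph in memory order. */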
+/* Broadcast _Float16 to vector. */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ph (_Float16 __A)
+{
+ return _mm_set_ph (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_ph (_Float16 __A)
+{
+ return _mm256_set_ph (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_ph (_Float16 __A)
+{
+ return _mm512_set_ph (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+/* Create a vector with all zeros. */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ph (void)
+{
+ return _mm_set1_ph (0.0f16);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_ph (void)
+{
+ return _mm256_set1_ph (0.0f16);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_ph (void)
+{
+ return _mm512_set1_ph (0.0f16);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ph (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m128h __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_ph (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m256h __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_ph (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m512h __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
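+/* The self-initialization above (__Y = __Y) is a deliberate idiom:
+ it yields a value the compiler treats as undefined without emitting
+ any instructions to materialize it, and the surrounding pragmas
+ silence the -Winit-self diagnostic the idiom would otherwise
+ trigger. */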
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_h (__m128h __A)
+{
+ return __A[0];
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsh_h (__m256h __A)
+{
+ return __A[0];
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsh_h (__m512h __A)
+{
+ return __A[0];
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_ps (__m512h __a)
+{
+ return (__m512) __a;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_pd (__m512h __a)
+{
+ return (__m512d) __a;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_si512 (__m512h __a)
+{
+ return (__m512i) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph512_ph128 (__m512h __A)
+{
+ union
+ {
+ __m128h __a[4];
+ __m512h __v;
+ } __u = { .__v = __A };
+ return __u.__a[0];
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph512_ph256 (__m512h __A)
+{
+ union
+ {
+ __m256h __a[2];
+ __m512h __v;
+ } __u = { .__v = __A };
+ return __u.__a[0];
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph128_ph512 (__m128h __A)
+{
+ union
+ {
+ __m128h __a[4];
+ __m512h __v;
+ } __u;
+ __u.__a[0] = __A;
+ return __u.__v;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph256_ph512 (__m256h __A)
+{
+ union
+ {
+ __m256h __a[2];
+ __m512h __v;
+ } __u;
+ __u.__a[0] = __A;
+ return __u.__v;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextph128_ph512 (__m128h __A)
+{
+ return (__m512h) _mm512_insertf32x4 (_mm512_setzero_ps (),
+ (__m128) __A, 0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextph256_ph512 (__m256h __A)
+{
+ return (__m512h) _mm512_insertf64x4 (_mm512_setzero_pd (),
+ (__m256d) __A, 0);
+}
+
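+/* Cast versus zero-extend: _mm512_castph128_ph512 and
+ _mm512_castph256_ph512 write only the low lanes of a union, so the
+ upper bits of the result are undefined; the _mm512_zextph*_ph512
+ variants instead insert into a zeroed vector, guaranteeing zeros in
+ the upper bits at the cost of an explicit zeroing operation. */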
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_ph (__m512 __a)
+{
+ return (__m512h) __a;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_ph (__m512d __a)
+{
+ return (__m512h) __a;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_ph (__m512i __a)
+{
+ return (__m512h) __a;
+}
+
+/* Create a vector with element 0 as F and the rest zero. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sh (_Float16 __F)
+{
+ return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16,
+ __F);
+}
+
+/* Create a vector with element 0 as *P and the rest zero. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sh (void const *__P)
+{
+ return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16,
+ *(_Float16 const *) __P);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_ph (void const *__P)
+{
+ return *(const __m512h *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_ph (void const *__P)
+{
+ return *(const __m256h *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ph (void const *__P)
+{
+ return *(const __m128h *) __P;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_ph (void const *__P)
+{
+ return *(const __m512h_u *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_ph (void const *__P)
+{
+ return *(const __m256h_u *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ph (void const *__P)
+{
+ return *(const __m128h_u *) __P;
+}
+
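+/* The unaligned loads work by dereferencing through the __m*h_u
+ types, whose __aligned__ (1) attribute tells the compiler that the
+ pointer may have any alignment, so an unaligned vector load is
+ emitted. The plain _mm*_load_ph forms above assume __P is naturally
+ (16-, 32- or 64-byte) aligned. */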
+/* Stores the lower _Float16 value. */
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sh (void *__P, __m128h __A)
+{
+ *(_Float16 *) __P = ((__v8hf)__A)[0];
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_ph (void *__P, __m512h __A)
+{
+ *(__m512h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_ph (void *__P, __m256h __A)
+{
+ *(__m256h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ph (void *__P, __m128h __A)
+{
+ *(__m128h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_ph (void *__P, __m512h __A)
+{
+ *(__m512h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_ph (void *__P, __m256h __A)
+{
+ *(__m256h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ph (void *__P, __m128h __A)
+{
+ *(__m128h_u *) __P = __A;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_ph (__m512h __A)
+{
+ return (__m512h) _mm512_and_epi32 (_mm512_set1_epi32 (0x7FFF7FFF),
+ (__m512i) __A);
+}
+
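+/* _mm512_abs_ph is a sign-bit clear: each _Float16 keeps its sign in
+ bit 15, so ANDing every 32-bit lane with 0x7FFF7FFF strips the sign
+ bits of both half-precision elements in that lane at once. */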
+/* Intrinsics v[add,sub,mul,div]ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_ph (__m512h __A, __m512h __B)
+{
+ return (__m512h) ((__v32hf) __A + (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_addph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_addph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_ph (__m512h __A, __m512h __B)
+{
+ return (__m512h) ((__v32hf) __A - (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_subph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_subph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_ph (__m512h __A, __m512h __B)
+{
+ return (__m512h) ((__v32hf) __A * (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_mulph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_mulph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_ph (__m512h __A, __m512h __B)
+{
+ return (__m512h) ((__v32hf) __A / (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_divph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_divph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_addph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_addph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_addph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_subph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_subph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_subph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_mulph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_mulph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_mulph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_divph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_divph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_divph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+#else
+#define _mm512_add_round_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_addph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_add_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_addph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_add_round_ph(A, B, C, D) \
+ ((__m512h)__builtin_ia32_addph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#define _mm512_sub_round_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_subph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_sub_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_subph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_sub_round_ph(A, B, C, D) \
+ ((__m512h)__builtin_ia32_subph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#define _mm512_mul_round_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_mulph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_mul_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_mulph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_mul_round_ph(A, B, C, D) \
+ ((__m512h)__builtin_ia32_mulph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#define _mm512_div_round_ph(A, B, C) \
+ ((__m512h)__builtin_ia32_divph512_mask_round((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_div_round_ph(A, B, C, D, E) \
+ ((__m512h)__builtin_ia32_divph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_div_round_ph(A, B, C, D) \
+ ((__m512h)__builtin_ia32_divph512_mask_round((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+#endif /* __OPTIMIZE__ */
+
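+/* The rounding argument of the *_round_* intrinsics above must be a
+ compile-time constant; the macro forms keep it visibly constant when
+ __OPTIMIZE__ is not defined. An illustrative call (operands are
+ hypothetical):
+
+ __m512h __r = _mm512_add_round_ph (__x, __y,
+ _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); */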
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conj_pch (__m512h __A)
+{
+ return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 (1 << 31));
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conj_pch (__m512h __W, __mmask16 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A),
+ (__v16sf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conj_pch (__mmask16 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A),
+ (__v16sf) _mm512_setzero_ps (),
+ (__mmask16) __U);
+}
+
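+/* Each 32-bit lane of a __m512h holding packed complex FP16 data is
+ a (real, imaginary) pair with the imaginary part in the upper half,
+ so the XOR with 1 << 31 in _mm512_conj_pch flips exactly the sign of
+ the imaginary component, i.e. complex conjugation. */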
+/* Intrinsics of v[add,sub,mul,div]sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_sh (__m128h __A, __m128h __B)
+{
+ __A[0] += __B[0];
+ return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_addsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_addsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_sh (__m128h __A, __m128h __B)
+{
+ __A[0] -= __B[0];
+ return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_subsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_subsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_sh (__m128h __A, __m128h __B)
+{
+ __A[0] *= __B[0];
+ return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_mulsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_mulsh_mask (__B, __C, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_sh (__m128h __A, __m128h __B)
+{
+ __A[0] /= __B[0];
+ return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_divsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_divsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
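+/* The scalar _sh arithmetic above computes on lane 0 only and passes
+ lanes 1-7 of __A through unchanged, matching the vaddsh, vsubsh,
+ vmulsh and vdivsh semantics; the mask/maskz forms route through the
+ corresponding builtins so that merge or zero masking applies to
+ lane 0. */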
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_addsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_addsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_addsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_subsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_subsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_subsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_mulsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_mulsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_mulsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_divsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_divsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_divsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+#else
+#define _mm_add_round_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_addsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_add_round_sh(A, B, C, D, E) \
+ ((__m128h)__builtin_ia32_addsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_add_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_addsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+
+#define _mm_sub_round_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_subsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_sub_round_sh(A, B, C, D, E) \
+ ((__m128h)__builtin_ia32_subsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_sub_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_subsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+
+#define _mm_mul_round_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_mulsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_mul_round_sh(A, B, C, D, E) \
+ ((__m128h)__builtin_ia32_mulsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_mul_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_mulsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+
+#define _mm_div_round_sh(A, B, C) \
+ ((__m128h)__builtin_ia32_divsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_div_round_sh(A, B, C, D, E) \
+ ((__m128h)__builtin_ia32_divsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_div_round_sh(A, B, C, D) \
+ ((__m128h)__builtin_ia32_divsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vmaxph, vminph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_ph (__m512h __A, __m512h __B)
+{
+ return __builtin_ia32_maxph512_mask (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_maxph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_maxph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_ph (__m512h __A, __m512h __B)
+{
+ return __builtin_ia32_minph512_mask (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_minph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_minph512_mask (__B, __C,
+ _mm512_setzero_ph (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_maxph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_maxph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_maxph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_minph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_minph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_minph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm512_max_round_ph(A, B, C) \
+ (__builtin_ia32_maxph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_max_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_maxph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_max_round_ph(A, B, C, D) \
+ (__builtin_ia32_maxph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#define _mm512_min_round_ph(A, B, C) \
+ (__builtin_ia32_minph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_min_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_minph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_min_round_ph(A, B, C, D) \
+ (__builtin_ia32_minph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vmaxsh, vminsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_sh (__m128h __A, __m128h __B)
+{
+ __A[0] = __A[0] > __B[0] ? __A[0] : __B[0];
+ return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_maxsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_maxsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_sh (__m128h __A, __m128h __B)
+{
+ __A[0] = __A[0] < __B[0] ? __A[0] : __B[0];
+ return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_minsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_minsh_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_maxsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_maxsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_maxsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_minsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_minsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_minsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm_max_round_sh(A, B, C) \
+ (__builtin_ia32_maxsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_max_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_maxsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_max_round_sh(A, B, C, D) \
+ (__builtin_ia32_maxsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+
+#define _mm_min_round_sh(A, B, C) \
+ (__builtin_ia32_minsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_min_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_minsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_min_round_sh(A, B, C, D) \
+ (__builtin_ia32_minsh_mask_round ((B), (C), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcmpph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_ph_mask (__m512h __A, __m512h __B, const int __C)
+{
+ return (__mmask32) __builtin_ia32_cmpph512_mask (__A, __B, __C,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return (__mmask32) __builtin_ia32_cmpph512_mask (__B, __C, __D,
+ __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_ph_mask (__m512h __A, __m512h __B, const int __C,
+ const int __D)
+{
+ return (__mmask32) __builtin_ia32_cmpph512_mask_round (__A, __B,
+ __C, (__mmask32) -1,
+ __D);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D, const int __E)
+{
+ return (__mmask32) __builtin_ia32_cmpph512_mask_round (__B, __C,
+ __D, __A,
+ __E);
+}
+
+#else
+#define _mm512_cmp_ph_mask(A, B, C) \
+ (__builtin_ia32_cmpph512_mask ((A), (B), (C), (-1)))
+
+#define _mm512_mask_cmp_ph_mask(A, B, C, D) \
+ (__builtin_ia32_cmpph512_mask ((B), (C), (D), (A)))
+
+#define _mm512_cmp_round_ph_mask(A, B, C, D) \
+ (__builtin_ia32_cmpph512_mask_round ((A), (B), (C), (-1), (D)))
+
+#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E) \
+ (__builtin_ia32_cmpph512_mask_round ((B), (C), (D), (A), (E)))
+
+#endif /* __OPTIMIZE__ */
+
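+/* The comparison predicate is an immediate from the _CMP_* set. A
+ minimal usage sketch (operands are hypothetical):
+
+ __mmask32 __m = _mm512_cmp_ph_mask (__x, __y, _CMP_LT_OS);
+
+ sets bit N of __m when lane N of __x compares less-than the
+ corresponding lane of __y. */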
+/* Intrinsics vcmpsh. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sh_mask (__m128h __A, __m128h __B, const int __C)
+{
+ return (__mmask8)
+ __builtin_ia32_cmpsh_mask_round (__A, __B,
+ __C, (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_sh_mask (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return (__mmask8)
+ __builtin_ia32_cmpsh_mask_round (__B, __C,
+ __D, __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_sh_mask (__m128h __A, __m128h __B, const int __C,
+ const int __D)
+{
+ return (__mmask8) __builtin_ia32_cmpsh_mask_round (__A, __B,
+ __C, (__mmask8) -1,
+ __D);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_sh_mask (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D, const int __E)
+{
+ return (__mmask8) __builtin_ia32_cmpsh_mask_round (__B, __C,
+ __D, __A,
+ __E);
+}
+
+#else
+#define _mm_cmp_sh_mask(A, B, C) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), \
+ (_MM_FROUND_CUR_DIRECTION)))
+
+#define _mm_mask_cmp_sh_mask(A, B, C, D) \
+ (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), \
+ (_MM_FROUND_CUR_DIRECTION)))
+
+#define _mm_cmp_round_sh_mask(A, B, C, D) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), (D)))
+
+#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \
+ (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), (E)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcomish. */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OS,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_US,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_UQ,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
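+/* The comi* helpers above use signaling predicates (the _OS/_US
+ forms), which raise the invalid-operation exception even for quiet
+ NaN inputs, while the ucomi* helpers use the quiet _OQ/_UQ forms
+ that signal only on signaling NaNs, mirroring the classic
+ comiss/ucomiss split. */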
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_sh (__m128h __A, __m128h __B, const int __P)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, __P,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_sh (__m128h __A, __m128h __B, const int __P, const int __R)
+{
+ return __builtin_ia32_cmpsh_mask_round (__A, __B, __P,
+ (__mmask8) -1, __R);
+}
+
+#else
+#define _mm_comi_round_sh(A, B, P, R) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), (R)))
+#define _mm_comi_sh(A, B, P) \
+ (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#endif /* __OPTIMIZE__ */
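+
+/* Editorial note, not part of the upstream header: the comi* forms above
+ use ordered signaling predicates (_CMP_*_OS) while the ucomi* forms use
+ the quiet variants (_CMP_*_OQ, _CMP_NEQ_UQ), so only comi* raises the
+ invalid exception for quiet-NaN operands. A minimal sketch, assuming
+ the _mm_set_sh helper defined earlier in this header:
+
+ __m128h __a = _mm_set_sh ((_Float16) 1.0);
+ __m128h __b = _mm_set_sh ((_Float16) 2.0);
+ int __lt = _mm_comilt_sh (__a, __b);
+
+ Here __lt is 1, since 1.0 < 2.0. */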
+
+/* Intrinsics vsqrtph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_ph (__m512h __A)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__A,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_ph (__mmask32 __A, __m512h __B)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__B,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_ph (__m512h __A, const int __B)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__A,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_ph (__mmask32 __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_sqrtph512_mask_round (__B,
+ _mm512_setzero_ph (),
+ __A, __C);
+}
+
+#else
+#define _mm512_sqrt_round_ph(A, B) \
+ (__builtin_ia32_sqrtph512_mask_round ((A), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (B)))
+
+#define _mm512_mask_sqrt_round_ph(A, B, C, D) \
+ (__builtin_ia32_sqrtph512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_sqrt_round_ph(A, B, C) \
+ (__builtin_ia32_sqrtph512_mask_round ((B), \
+ _mm512_setzero_ph (), \
+ (A), (C)))
+
+#endif /* __OPTIMIZE__ */
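+
+/* Editorial usage sketch, not part of the upstream header: the _round
+ variants take an explicit rounding control instead of the MXCSR
+ setting, e.g.
+
+ __m512h __r
+ = _mm512_sqrt_round_ph (__x, _MM_FROUND_TO_NEAREST_INT
+ | _MM_FROUND_NO_EXC); */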
+
+/* Intrinsics vrsqrtph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt_ph (__m512h __A)
+{
+ return __builtin_ia32_rsqrtph512_mask (__A, _mm512_setzero_ph (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+ return __builtin_ia32_rsqrtph512_mask (__C, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt_ph (__mmask32 __A, __m512h __B)
+{
+ return __builtin_ia32_rsqrtph512_mask (__B, _mm512_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrsqrtsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_rsqrtsh_mask (__B, __A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_rsqrtsh_mask (__D, __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_rsqrtsh_mask (__C, __B, _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vsqrtsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B,
+ __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_sqrtsh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm_sqrt_round_sh(A, B, C) \
+ (__builtin_ia32_sqrtsh_mask_round ((B), (A), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_sqrt_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_sqrtsh_mask_round ((D), (C), (A), (B), (E)))
+
+#define _mm_maskz_sqrt_round_sh(A, B, C, D) \
+ (__builtin_ia32_sqrtsh_mask_round ((C), (B), \
+ _mm_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
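+
+/* Editorial note, not part of the upstream header: as with every scalar
+ *_sh operation in this header, _mm_sqrt_sh (__a, __b) writes
+ sqrt (__b[0]) to element 0 of the result and copies elements 1..7 from
+ __a; the mask forms merge or zero element 0 only:
+
+ __m128h __r = _mm_sqrt_sh (__a, __b); */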
+
+/* Intrinsics vrcpph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp_ph (__m512h __A)
+{
+ return __builtin_ia32_rcpph512_mask (__A, _mm512_setzero_ph (),
+ (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+ return __builtin_ia32_rcpph512_mask (__C, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp_ph (__mmask32 __A, __m512h __B)
+{
+ return __builtin_ia32_rcpph512_mask (__B, _mm512_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrcpsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_rcpsh_mask (__B, __A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_rcpsh_mask (__D, __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_rcpsh_mask (__C, __B, _mm_setzero_ph (),
+ __A);
+}
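+
+/* Editorial note, not part of the upstream header: vrcpph/vrcpsh and the
+ vrsqrtph/vrsqrtsh intrinsics above are hardware approximations of 1/x
+ and 1/sqrt (x); see the Intel SDM for the exact error bounds:
+
+ __m512h __approx = _mm512_rcp_ph (__x); */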
+
+/* Intrinsics vscalefph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_ph (__m512h __A, __m512h __B)
+{
+ return __builtin_ia32_scalefph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+ return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+ return __builtin_ia32_scalefph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+ return __builtin_ia32_scalefph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B,
+ __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+ const int __D)
+{
+ return __builtin_ia32_scalefph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm512_scalef_round_ph(A, B, C) \
+ (__builtin_ia32_scalefph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_scalef_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_scalefph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_scalef_round_ph(A, B, C, D) \
+ (__builtin_ia32_scalefph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
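+
+/* Editorial usage sketch, not part of the upstream header: vscalefph
+ computes __a * 2^floor (__b) per element:
+
+ __m512h __a = _mm512_set1_ph ((_Float16) 3.0);
+ __m512h __b = _mm512_set1_ph ((_Float16) 2.0);
+ __m512h __r = _mm512_scalef_ph (__a, __b);
+
+ Every element of __r is 12.0. */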
+
+/* Intrinsics vscalefsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_sh (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_scalefsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_scalefsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+ return __builtin_ia32_scalefsh_mask_round (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B,
+ __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return __builtin_ia32_scalefsh_mask_round (__B, __C,
+ _mm_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm_scalef_round_sh(A, B, C) \
+ (__builtin_ia32_scalefsh_mask_round ((A), (B), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (C)))
+
+#define _mm_mask_scalef_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_scalefsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_scalef_round_sh(A, B, C, D) \
+ (__builtin_ia32_scalefsh_mask_round ((B), (C), _mm_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vreduceph. */
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_ph (__m512h __A, int __B)
+{
+ return __builtin_ia32_reduceph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_ph (__m512h __A, __mmask32 __B, __m512h __C, int __D)
+{
+ return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_ph (__mmask32 __A, __m512h __B, int __C)
+{
+ return __builtin_ia32_reduceph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_round_ph (__m512h __A, int __B, const int __C)
+{
+ return __builtin_ia32_reduceph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+ int __D, const int __E)
+{
+ return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B,
+ __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_round_ph (__mmask32 __A, __m512h __B, int __C,
+ const int __D)
+{
+ return __builtin_ia32_reduceph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm512_reduce_ph(A, B) \
+ (__builtin_ia32_reduceph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_reduce_ph(A, B, C) \
+ (__builtin_ia32_reduceph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_reduce_round_ph(A, B, C) \
+ (__builtin_ia32_reduceph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_reduce_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_reduce_round_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vreducesh. */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_sh (__m128h __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_reducesh_mask_round (__A, __B, __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E)
+{
+ return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_reducesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (), __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_sh (__m128h __A, __m128h __B, int __C, const int __D)
+{
+ return __builtin_ia32_reducesh_mask_round (__A, __B, __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E, const int __F)
+{
+ return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A,
+ __B, __F);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ int __D, const int __E)
+{
+ return __builtin_ia32_reducesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+#else
+#define _mm_reduce_sh(A, B, C) \
+ (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_reduce_sh(A, B, C, D, E) \
+ (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_reduce_sh(A, B, C, D) \
+ (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_reduce_round_sh(A, B, C, D) \
+ (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (D)))
+
+#define _mm_mask_reduce_round_sh(A, B, C, D, E, F) \
+ (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), (F)))
+
+#define _mm_maskz_reduce_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), (E)))
+
+#endif /* __OPTIMIZE__ */
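+
+/* Editorial note, not part of the upstream header: vreduceph computes the
+ reduced argument __x - round (__x) per element, where imm8 bits 7:4
+ give the number of fraction bits kept by the rounding step and the low
+ imm8 bits select its rounding behavior; an imm8 of 0 therefore leaves
+ the signed fractional part:
+
+ __m512h __frac = _mm512_reduce_ph (__x, 0); */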
+
+/* Intrinsics vrndscaleph. */
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_ph (__m512h __A, int __B)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_ph (__m512h __A, __mmask32 __B,
+ __m512h __C, int __D)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_ph (__mmask32 __A, __m512h __B, int __C)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_ph (__m512h __A, int __B, const int __C)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__A, __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_ph (__m512h __A, __mmask32 __B,
+ __m512h __C, int __D, const int __E)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A,
+ __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_ph (__mmask32 __A, __m512h __B, int __C,
+ const int __D)
+{
+ return __builtin_ia32_rndscaleph512_mask_round (__B, __C,
+ _mm512_setzero_ph (),
+ __A, __D);
+}
+
+#else
+#define _mm512_roundscale_ph(A, B) \
+ (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_roundscale_ph(A, B, C) \
+ (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), \
+ _MM_FROUND_CUR_DIRECTION))
+#define _mm512_roundscale_round_ph(A, B, C) \
+ (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, (C)))
+
+#define _mm512_mask_roundscale_round_ph(A, B, C, D, E) \
+ (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_roundscale_round_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \
+ _mm512_setzero_ph (), \
+ (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrndscalesh. */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_sh (__m128h __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (), __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sh (__m128h __A, __m128h __B, int __C, const int __D)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, int __E, const int __F)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E,
+ __A, __B, __F);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+ int __D, const int __E)
+{
+ return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+#else
+#define _mm_roundscale_sh(A, B, C) \
+ (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_sh(A, B, C, D, E) \
+ (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_roundscale_sh(A, B, C, D) \
+ (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_roundscale_round_sh(A, B, C, D) \
+ (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (D)))
+
+#define _mm_mask_roundscale_round_sh(A, B, C, D, E, F) \
+ (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), (F)))
+
+#define _mm_maskz_roundscale_round_sh(A, B, C, D, E) \
+ (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \
+ _mm_setzero_ph (), \
+ (A), (E)))
+
+#endif /* __OPTIMIZE__ */
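+
+/* Editorial usage sketch, not part of the upstream header: vrndscaleph
+ rounds each element to a multiple of 2^-M, with M taken from imm8 bits
+ 7:4 and the rounding mode from the low imm8 bits; an imm8 of 0 rounds
+ to the nearest integer:
+
+ __m512h __i = _mm512_roundscale_ph (__x, 0); */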
+
+/* Intrinsics vfpclasssh. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sh_mask (__m128h __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sh_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm, __U);
+}
+
+#else
+#define _mm_fpclass_sh_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+ (int) (C), (__mmask8) (-1)))
+
+#define _mm_mask_fpclass_sh_mask(U, X, C) \
+ ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X), \
+ (int) (C), (__mmask8) (U)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclassph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ph_mask (__mmask32 __U, __m512h __A,
+ const int __imm)
+{
+ return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ph_mask (__m512h __A, const int __imm)
+{
+ return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+ __imm,
+ (__mmask32) -1);
+}
+
+#else
+#define _mm512_mask_fpclass_ph_mask(u, x, c) \
+ ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+ (int) (c), (__mmask32) (u)))
+
+#define _mm512_fpclass_ph_mask(x, c) \
+ ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+ (int) (c), (__mmask32) -1))
+#endif /* __OPTIMIZE__ */
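+
+/* Editorial usage sketch, not part of the upstream header: the fpclass
+ immediate is a bitwise OR of category flags (0x01 QNaN, 0x02 positive
+ zero, 0x04 negative zero, 0x08 positive infinity, 0x10 negative
+ infinity, 0x20 denormal, 0x40 finite negative, 0x80 SNaN), e.g. to
+ flag the non-finite elements:
+
+ __mmask32 __k
+ = _mm512_fpclass_ph_mask (__x, 0x01 | 0x08 | 0x10 | 0x80); */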
+
+/* Intrinsics vgetexpph, vgetexpsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_sh (__m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__v8hf) _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__v8hf) __W, (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_sh (__mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__v8hf) _mm_setzero_ph (),
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_ph (__m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf) _mm512_setzero_ph (),
+ (__mmask32) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_ph (__m512h __W, __mmask32 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_getexpph512_mask ((__v32hf) __A, (__v32hf) __W,
+ (__mmask32) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_ph (__mmask32 __U, __m512h __A)
+{
+ return (__m512h)
+ __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf) _mm512_setzero_ph (),
+ (__mmask32) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_sh (__m128h __A, __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_sh (__m128h __W, __mmask8 __U, __m128h __A,
+ __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __W,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_sh (__mmask8 __U, __m128h __A, __m128h __B,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_ph (__m512h __A, const int __R)
+{
+ return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_ph (__m512h __W, __mmask32 __U, __m512h __A,
+ const int __R)
+{
+ return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf) __W,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_ph (__mmask32 __U, __m512h __A, const int __R)
+{
+ return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ (__mmask32) __U, __R);
+}
+
+#else
+#define _mm_getexp_round_sh(A, B, R) \
+ ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A), \
+ (__v8hf)(__m128h)(B), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)-1, R))
+
+#define _mm_mask_getexp_round_sh(W, U, A, B, C) \
+ ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A), \
+ (__v8hf)(__m128h)(B), \
+ (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), (C)))
+
+#define _mm_maskz_getexp_round_sh(U, A, B, C) \
+ ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A), \
+ (__v8hf)(__m128h)(B), \
+ (__v8hf)_mm_setzero_ph(), \
+ (__mmask8)(U), (C)))
+
+#define _mm512_getexp_round_ph(A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, R))
+
+#define _mm512_mask_getexp_round_ph(W, U, A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)(__m512h)(W), (__mmask32)(U), R))
+
+#define _mm512_maskz_getexp_round_ph(U, A, R) \
+ ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
+ (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), R))
+
+#endif /* __OPTIMIZE__ */
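+
+/* Editorial note, not part of the upstream header: vgetexpph returns
+ floor (log2 (|__x|)) of each element as an FP16 value, so an input
+ element of 8.0 yields 3.0:
+
+ __m512h __e = _mm512_getexp_ph (__x); */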
+
+/* Intrinsics vgetmantph, vgetmantsh. */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_sh (__m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128h)
+ __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__D << 2) | __C, _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_sh (__m128h __W, __mmask8 __U, __m128h __A,
+ __m128h __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128h)
+ __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__D << 2) | __C, (__v8hf) __W,
+ __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_sh (__mmask8 __U, __m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D)
+{
+ return (__m128h)
+ __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+ (__D << 2) | __C,
+ (__v8hf) _mm_setzero_ph (),
+ __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_ph (__m512h __W, __mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf) __W, __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_ph (__mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_sh (__m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__D << 2) | __C,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_sh (__m128h __W, __mmask8 __U, __m128h __A,
+ __m128h __B, _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__D << 2) | __C,
+ (__v8hf) __W,
+ __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_sh (__mmask8 __U, __m128h __A, __m128h __B,
+ _MM_MANTISSA_NORM_ENUM __C,
+ _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+ return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__D << 2) | __C,
+ (__v8hf)
+ _mm_setzero_ph (),
+ __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ _mm512_setzero_ph (),
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_ph (__m512h __W, __mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf) __W, __U,
+ __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+ (__C << 2) | __B,
+ (__v32hf)
+ _mm512_setzero_ph (),
+ __U, __R);
+}
+
+#else
+#define _mm512_getmant_ph(X, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ph(W, U, X, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_ph(U, X, B, C) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getmant_sh(X, Y, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sh(W, U, X, Y, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_sh(U, X, Y, C, D) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph(), \
+ (__mmask8)(U), \
+ _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getmant_round_ph(X, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)-1, \
+ (R)))
+
+#define _mm512_mask_getmant_round_ph(W, U, X, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h)(W), \
+ (__mmask32)(U), \
+ (R)))
+
+#define _mm512_maskz_getmant_round_ph(U, X, B, C, R) \
+ ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \
+ (int)(((C)<<2) | (B)), \
+ (__v32hf)(__m512h) \
+ _mm512_setzero_ph(), \
+ (__mmask32)(U), \
+ (R)))
+
+#define _mm_getmant_round_sh(X, Y, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ (R)))
+
+#define _mm_mask_getmant_round_sh(W, U, X, Y, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h)(W), \
+ (__mmask8)(U), \
+ (R)))
+
+#define _mm_maskz_getmant_round_sh(U, X, Y, C, D, R) \
+ ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+ (__v8hf)(__m128h)(Y), \
+ (int)(((D)<<2) | (C)), \
+ (__v8hf)(__m128h) \
+ _mm_setzero_ph(), \
+ (__mmask8)(U), \
+ (R)))
+
+#endif /* __OPTIMIZE__ */
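+
+/* Editorial usage sketch, not part of the upstream header: vgetmantph
+ normalizes each mantissa into the interval selected by the
+ _MM_MANTISSA_NORM_ENUM argument, with the sign treated per the
+ _MM_MANTISSA_SIGN_ENUM argument:
+
+ __m512h __m = _mm512_getmant_ph (__x, _MM_MANT_NORM_1_2,
+ _MM_MANT_SIGN_src);
+
+ Each element of __m has its mantissa in [1, 2) and keeps the source
+ sign. */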
+
+/* Intrinsics vmovw. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi16_si128 (short __A)
+{
+ return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si16 (__m128i __A)
+{
+ return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0);
+}
+
+/* Intrinsics vmovsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C)
+{
+ return __builtin_ia32_loadsh_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B)
+{
+ return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sh (_Float16 *__A, __mmask8 __B, __m128h __C)
+{
+ __builtin_ia32_storesh_mask (__A, __C, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sh (__m128h __A, __m128h __B)
+{
+ __A[0] = __B[0];
+ return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A);
+}
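+
+/* Editorial note, not part of the upstream header:
+ _mm_mask_move_sh (__w, __k, __a, __b) picks __b[0] (bit 0 of __k set)
+ or __w[0] (bit clear) for element 0 and copies elements 1..7 from __a;
+ the maskz form substitutes zero for __w[0]. */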
+
+/* Intrinsics vcvtph2dq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi32 (__m512i __A, __mmask16 __B, __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epi32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2dq512_mask_round ((A), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_epi32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2dq512_mask_round ((C), (__v16si)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epi32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2dq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
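+
+/* Editorial usage sketch, not part of the upstream header: the _round
+ conversions take an explicit rounding mode, while the truncating
+ vcvttph2* forms further below always round toward zero:
+
+ __m512i __i
+ = _mm512_cvt_roundph_epi32 (__h, _MM_FROUND_TO_NEAREST_INT
+ | _MM_FROUND_NO_EXC); */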
+
+/* Intrinsics vcvtph2udq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu32 (__m512i __A, __mmask16 __B, __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epu32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2udq512_mask_round ((A), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_epu32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2udq512_mask_round ((C), (__v16si)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2udq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2dq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi32 (__m512i __A, __mmask16 __B,
+ __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2dq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epi32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2dq512_mask_round ((A), \
+ (__v16si) \
+ (_mm512_setzero_si512 ()), \
+ (__mmask16)(-1), (B)))
+
+#define _mm512_mask_cvtt_roundph_epi32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2dq512_mask_round ((C), \
+ (__v16si)(A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2dq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2udq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu32 (__m256h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu32 (__mmask16 __A, __m256h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu32 (__m256h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu32 (__m512i __A, __mmask16 __B,
+ __m256h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__C,
+ (__v16si) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2udq512_mask_round (__B,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epu32(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2udq512_mask_round ((A), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvtt_roundph_epu32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2udq512_mask_round ((C), \
+ (__v16si)(A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu32(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2udq512_mask_round ((B), \
+ (__v16si) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtdq2ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_ph (__m512i __A)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C,
+ __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_ph (__mmask16 __A, __m512i __B)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B,
+ _mm256_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi32_ph (__m512i __A, int __B)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C,
+ __A,
+ __B,
+ __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+ return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B,
+ _mm256_setzero_ph (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundepi32_ph(A, B) \
+ (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(A), \
+ _mm256_setzero_ph (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundepi32_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(C), \
+ (A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvt_roundepi32_ph(A, B, C) \
+ (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(B), \
+ _mm256_setzero_ph (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
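+
+/* Editorial note, not part of the upstream header: converting sixteen
+ 32-bit integers to _Float16 narrows 512 bits to 256, which is why
+ these conversions return __m256h; values outside the FP16 range
+ overflow to infinity under the default rounding mode. */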
+
+/* Intrinsics vcvtudq2ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ph (__m512i __A)
+{
+ return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+ return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+ __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ph (__mmask16 __A, __m512i __B)
+{
+ return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+ _mm256_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ph (__m512i __A, int __B)
+{
+ return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D)
+{
+ return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+ __A,
+ __B,
+ __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+ return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+ _mm256_setzero_ph (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundepu32_ph(A, B) \
+ (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(A), \
+ _mm256_setzero_ph (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundepu32_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(C), \
+ (A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvt_roundepu32_ph(A, B, C) \
+ (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(B), \
+ _mm256_setzero_ph (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2qq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2qq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2qq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi64 (__m128h __A, int __B)
+{
+ return __builtin_ia32_vcvtph2qq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_vcvtph2qq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epi64(A, B) \
+ (__builtin_ia32_vcvtph2qq512_mask_round ((A), \
+ _mm512_setzero_si512 (), \
+ (__mmask8)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_epi64(A, B, C, D) \
+ (__builtin_ia32_vcvtph2qq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epi64(A, B, C) \
+ (__builtin_ia32_vcvtph2qq512_mask_round ((B), \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2uqq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2uqq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2uqq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu64 (__m128h __A, int __B)
+{
+ return __builtin_ia32_vcvtph2uqq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_vcvtph2uqq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epu64(A, B) \
+ (__builtin_ia32_vcvtph2uqq512_mask_round ((A), \
+ _mm512_setzero_si512 (), \
+ (__mmask8)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_epu64(A, B, C, D) \
+ (__builtin_ia32_vcvtph2uqq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu64(A, B, C) \
+ (__builtin_ia32_vcvtph2uqq512_mask_round ((B), \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2qq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2qq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvttph2qq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi64 (__m128h __A, int __B)
+{
+ return __builtin_ia32_vcvttph2qq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_vcvttph2qq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epi64(A, B) \
+ (__builtin_ia32_vcvttph2qq512_mask_round ((A), \
+ _mm512_setzero_si512 (), \
+ (__mmask8)-1, \
+ (B)))
+
+#define _mm512_mask_cvtt_roundph_epi64(A, B, C, D) \
+ (__builtin_ia32_vcvttph2qq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi64(A, B, C) \
+ (__builtin_ia32_vcvttph2qq512_mask_round ((B), \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2uqq. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2uqq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvttph2uqq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu64 (__m128h __A, int __B)
+{
+ return __builtin_ia32_vcvttph2uqq512_mask_round (__A,
+ _mm512_setzero_si512 (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_vcvttph2uqq512_mask_round (__B,
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epu64(A, B) \
+ (__builtin_ia32_vcvttph2uqq512_mask_round ((A), \
+ _mm512_setzero_si512 (), \
+ (__mmask8)-1, \
+ (B)))
+
+#define _mm512_mask_cvtt_roundph_epu64(A, B, C, D) \
+ (__builtin_ia32_vcvttph2uqq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu64(A, B, C) \
+ (__builtin_ia32_vcvttph2uqq512_mask_round ((B), \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
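+
+/* Note (illustrative, assumes the masking/rounding conventions above):
+   the truncating vcvttph2qq/vcvttph2uqq forms always round toward zero,
+   so the rounding argument of their _round variants is only meaningful
+   as _MM_FROUND_NO_EXC (suppress exceptions) or _MM_FROUND_CUR_DIRECTION:
+
+     __m128h h = _mm_set1_ph ((_Float16) -2.9);
+     __m512i t = _mm512_cvtt_roundph_epi64 (h, _MM_FROUND_NO_EXC);  // -2
+*/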
+
+/* Intrinsics vcvtqq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_ph (__m512i __A)
+{
+ return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m512i __C)
+{
+ return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C,
+ __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_ph (__mmask8 __A, __m512i __B)
+{
+ return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B,
+ _mm_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi64_ph (__m512i __A, int __B)
+{
+ return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D)
+{
+ return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C,
+ __A,
+ __B,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi64_ph (__mmask8 __A, __m512i __B, int __C)
+{
+ return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B,
+ _mm_setzero_ph (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundepi64_ph(A, B) \
+ (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(A), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundepi64_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepi64_ph(A, B, C) \
+ (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(B), \
+ _mm_setzero_ph (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtuqq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_ph (__m512i __A)
+{
+ return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m512i __C)
+{
+ return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C,
+ __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_ph (__mmask8 __A, __m512i __B)
+{
+ return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B,
+ _mm_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu64_ph (__m512i __A, int __B)
+{
+ return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D)
+{
+ return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C,
+ __A,
+ __B,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu64_ph (__mmask8 __A, __m512i __B, int __C)
+{
+ return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B,
+ _mm_setzero_ph (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundepu64_ph(A, B) \
+ (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(A), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundepu64_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepu64_ph(A, B, C) \
+ (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(B), \
+ _mm_setzero_ph (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
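+
+/* Usage sketch (illustrative only; hypothetical variables).  The
+   qq2ph/uqq2ph direction narrows eight 64-bit integer lanes into the
+   eight _Float16 elements of a __m128h result:
+
+     __m512i q = _mm512_set1_epi64 (1000);
+     __m128h h = _mm512_cvtepu64_ph (q);  // 1000.0 in each half lane
+*/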
+
+/* Intrinsics vcvtph2w. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi16 (__m512h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2w512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2w512_mask_round (__C,
+ (__v32hi) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi16 (__mmask32 __A, __m512h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2w512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi16 (__m512h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2w512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi16 (__m512i __A, __mmask32 __B, __m512h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2w512_mask_round (__C,
+ (__v32hi) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2w512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epi16(A, B) \
+ ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((A), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_epi16(A, B, C, D) \
+ ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((C), \
+ (__v32hi)(A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvt_roundph_epi16(A, B, C) \
+ ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((B), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2uw. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu16 (__m512h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2uw512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu16 (__mmask32 __A, __m512h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2uw512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu16 (__m512h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2uw512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu16 (__m512i __A, __mmask32 __B, __m512h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvtph2uw512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epu16(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2uw512_mask_round ((A), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)-1, (B)))
+
+#define _mm512_mask_cvt_roundph_epu16(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2uw512_mask_round ((C), (__v32hi)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu16(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvtph2uw512_mask_round ((B), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2w. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi16 (__m512h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2w512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2w512_mask_round (__C,
+ (__v32hi) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi16 (__mmask32 __A, __m512h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2w512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi16 (__m512h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2w512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi16 (__m512i __A, __mmask32 __B,
+ __m512h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2w512_mask_round (__C,
+ (__v32hi) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2w512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epi16(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2w512_mask_round ((A), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)-1, \
+ (B)))
+
+#define _mm512_mask_cvtt_roundph_epi16(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2w512_mask_round ((C), \
+ (__v32hi)(A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi16(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2w512_mask_round ((B), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2uw. */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu16 (__m512h __A)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2uw512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2uw512_mask_round (__C,
+ (__v32hi) __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu16 (__mmask32 __A, __m512h __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2uw512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu16 (__m512h __A, int __B)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2uw512_mask_round (__A,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ (__mmask32) -1,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu16 (__m512i __A, __mmask32 __B,
+ __m512h __C, int __D)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2uw512_mask_round (__C,
+ (__v32hi) __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C)
+{
+ return (__m512i)
+ __builtin_ia32_vcvttph2uw512_mask_round (__B,
+ (__v32hi)
+ _mm512_setzero_si512 (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epu16(A, B) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2uw512_mask_round ((A), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (__mmask32)-1, \
+ (B)))
+
+#define _mm512_mask_cvtt_roundph_epu16(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2uw512_mask_round ((C), \
+ (__v32hi)(A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu16(A, B, C) \
+ ((__m512i) \
+ __builtin_ia32_vcvttph2uw512_mask_round ((B), \
+ (__v32hi) \
+ _mm512_setzero_si512 (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
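+
+/* Usage sketch (illustrative only; hypothetical variables).  The
+   word-sized conversions cover all 32 lanes of a __m512h; the _mask
+   forms select per lane between the converted value and the
+   pass-through source:
+
+     __m512h h   = _mm512_set1_ph ((_Float16) 7.5);
+     __m512i src = _mm512_set1_epi16 (-1);
+     __m512i w   = _mm512_mask_cvttph_epi16 (src, 0x0000FFFFu, h);
+     // lanes 0..15 hold 7 (truncated), lanes 16..31 keep -1
+*/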
+
+/* Intrinsics vcvtw2ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_ph (__m512i __A)
+{
+ return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_ph (__m512h __A, __mmask32 __B, __m512i __C)
+{
+ return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C,
+ __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_ph (__mmask32 __A, __m512i __B)
+{
+ return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi16_ph (__m512i __A, int __B)
+{
+ return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D)
+{
+ return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C,
+ __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi16_ph (__mmask32 __A, __m512i __B, int __C)
+{
+ return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B,
+ _mm512_setzero_ph (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundepi16_ph(A, B) \
+ (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(A), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundepi16_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(C), \
+ (A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvt_roundepi16_ph(A, B, C) \
+ (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(B), \
+ _mm512_setzero_ph (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtuw2ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_ph (__m512i __A)
+{
+ return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_ph (__m512h __A, __mmask32 __B, __m512i __C)
+{
+ return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C,
+ __A,
+ __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_ph (__mmask32 __A, __m512i __B)
+{
+ return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B,
+ _mm512_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu16_ph (__m512i __A, int __B)
+{
+ return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A,
+ _mm512_setzero_ph (),
+ (__mmask32) -1,
+ __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D)
+{
+ return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C,
+ __A,
+ __B,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu16_ph (__mmask32 __A, __m512i __B, int __C)
+{
+ return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B,
+ _mm512_setzero_ph (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundepu16_ph(A, B) \
+ (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(A), \
+ _mm512_setzero_ph (), \
+ (__mmask32)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundepu16_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(C), \
+ (A), \
+ (B), \
+ (D)))
+
+#define _mm512_maskz_cvt_roundepu16_ph(A, B, C) \
+ (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(B), \
+ _mm512_setzero_ph (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
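+
+/* Usage sketch (illustrative only; hypothetical variables): integer
+   words to half precision, picking the rounding mode explicitly on the
+   _round form:
+
+     __m512i w = _mm512_set1_epi16 (3);
+     __m512h h = _mm512_cvt_roundepi16_ph (w, _MM_FROUND_TO_NEAREST_INT
+					      | _MM_FROUND_NO_EXC);
+*/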
+
+/* Intrinsics vcvtsh2si, vcvtsh2usi. */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_i32 (__m128h __A)
+{
+ return (int) __builtin_ia32_vcvtsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_u32 (__m128h __A)
+{
+ return (unsigned) __builtin_ia32_vcvtsh2usi32_round (__A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_i32 (__m128h __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvtsh2si32_round (__A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_u32 (__m128h __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvtsh2usi32_round (__A, __R);
+}
+
+#else
+#define _mm_cvt_roundsh_i32(A, B) \
+ ((int)__builtin_ia32_vcvtsh2si32_round ((A), (B)))
+#define _mm_cvt_roundsh_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvtsh2usi32_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_i64 (__m128h __A)
+{
+ return (long long)
+ __builtin_ia32_vcvtsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_u64 (__m128h __A)
+{
+ return (unsigned long long)
+ __builtin_ia32_vcvtsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_i64 (__m128h __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvtsh2si64_round (__A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_u64 (__m128h __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvtsh2usi64_round (__A, __R);
+}
+
+#else
+#define _mm_cvt_roundsh_i64(A, B) \
+ ((long long)__builtin_ia32_vcvtsh2si64_round ((A), (B)))
+#define _mm_cvt_roundsh_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvtsh2usi64_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
+
+/* Intrinsics vcvttsh2si, vcvttsh2usi. */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_i32 (__m128h __A)
+{
+ return (int)
+ __builtin_ia32_vcvttsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_u32 (__m128h __A)
+{
+ return (unsigned)
+ __builtin_ia32_vcvttsh2usi32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_i32 (__m128h __A, const int __R)
+{
+ return (int) __builtin_ia32_vcvttsh2si32_round (__A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_u32 (__m128h __A, const int __R)
+{
+ return (unsigned) __builtin_ia32_vcvttsh2usi32_round (__A, __R);
+}
+
+#else
+#define _mm_cvtt_roundsh_i32(A, B) \
+ ((int)__builtin_ia32_vcvttsh2si32_round ((A), (B)))
+#define _mm_cvtt_roundsh_u32(A, B) \
+ ((unsigned)__builtin_ia32_vcvttsh2usi32_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_i64 (__m128h __A)
+{
+ return (long long)
+ __builtin_ia32_vcvttsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_u64 (__m128h __A)
+{
+ return (unsigned long long)
+ __builtin_ia32_vcvttsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_i64 (__m128h __A, const int __R)
+{
+ return (long long) __builtin_ia32_vcvttsh2si64_round (__A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_u64 (__m128h __A, const int __R)
+{
+ return (unsigned long long) __builtin_ia32_vcvttsh2usi64_round (__A, __R);
+}
+
+#else
+#define _mm_cvtt_roundsh_i64(A, B) \
+ ((long long)__builtin_ia32_vcvttsh2si64_round ((A), (B)))
+#define _mm_cvtt_roundsh_u64(A, B) \
+ ((unsigned long long)__builtin_ia32_vcvttsh2usi64_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
+
+/* Intrinsics vcvtsi2sh, vcvtusi2sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti32_sh (__m128h __A, int __B)
+{
+ return __builtin_ia32_vcvtsi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_sh (__m128h __A, unsigned int __B)
+{
+ return __builtin_ia32_vcvtusi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi32_sh (__m128h __A, int __B, const int __R)
+{
+ return __builtin_ia32_vcvtsi2sh32_round (__A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu32_sh (__m128h __A, unsigned int __B, const int __R)
+{
+ return __builtin_ia32_vcvtusi2sh32_round (__A, __B, __R);
+}
+
+#else
+#define _mm_cvt_roundi32_sh(A, B, C) \
+ (__builtin_ia32_vcvtsi2sh32_round ((A), (B), (C)))
+#define _mm_cvt_roundu32_sh(A, B, C) \
+ (__builtin_ia32_vcvtusi2sh32_round ((A), (B), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti64_sh (__m128h __A, long long __B)
+{
+ return __builtin_ia32_vcvtsi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_sh (__m128h __A, unsigned long long __B)
+{
+ return __builtin_ia32_vcvtusi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_sh (__m128h __A, long long __B, const int __R)
+{
+ return __builtin_ia32_vcvtsi2sh64_round (__A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_sh (__m128h __A, unsigned long long __B, const int __R)
+{
+ return __builtin_ia32_vcvtusi2sh64_round (__A, __B, __R);
+}
+
+#else
+#define _mm_cvt_roundi64_sh(A, B, C) \
+ (__builtin_ia32_vcvtsi2sh64_round ((A), (B), (C)))
+#define _mm_cvt_roundu64_sh(A, B, C) \
+ (__builtin_ia32_vcvtusi2sh64_round ((A), (B), (C)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
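+
+/* Usage sketch (illustrative only; hypothetical variables).  The scalar
+   conversions replace only the lowest element and copy the remaining
+   seven half-precision elements from the first operand:
+
+     __m128h a = _mm_set1_ph ((_Float16) 1.0);
+     __m128h b = _mm_cvti32_sh (a, 42);  // lane 0 = 42.0, lanes 1..7 = 1.0
+     int     i = _mm_cvtsh_i32 (b);      // 42
+*/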
+
+/* Intrinsics vcvtph2pd. */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_pd (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2pd512_mask_round (__A,
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_pd (__m512d __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2pd512_mask_round (__B,
+ _mm512_setzero_pd (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_pd (__m128h __A, int __B)
+{
+ return __builtin_ia32_vcvtph2pd512_mask_round (__A,
+ _mm512_setzero_pd (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_pd (__m512d __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_pd (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_vcvtph2pd512_mask_round (__B,
+ _mm512_setzero_pd (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvt_roundph_pd(A, B) \
+ (__builtin_ia32_vcvtph2pd512_mask_round ((A), \
+ _mm512_setzero_pd (), \
+ (__mmask8)-1, \
+ (B)))
+
+#define _mm512_mask_cvt_roundph_pd(A, B, C, D) \
+ (__builtin_ia32_vcvtph2pd512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_pd(A, B, C) \
+ (__builtin_ia32_vcvtph2pd512_mask_round ((B), \
+ _mm512_setzero_pd (), \
+ (A), \
+ (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2psx. */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtxph_ps (__m256h __A)
+{
+ return __builtin_ia32_vcvtph2psx512_mask_round (__A,
+ _mm512_setzero_ps (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtxph_ps (__m512 __A, __mmask16 __B, __m256h __C)
+{
+ return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtxph_ps (__mmask16 __A, __m256h __B)
+{
+ return __builtin_ia32_vcvtph2psx512_mask_round (__B,
+ _mm512_setzero_ps (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtx_roundph_ps (__m256h __A, int __B)
+{
+ return __builtin_ia32_vcvtph2psx512_mask_round (__A,
+ _mm512_setzero_ps (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtx_roundph_ps (__m512 __A, __mmask16 __B, __m256h __C, int __D)
+{
+ return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtx_roundph_ps (__mmask16 __A, __m256h __B, int __C)
+{
+ return __builtin_ia32_vcvtph2psx512_mask_round (__B,
+ _mm512_setzero_ps (),
+ __A,
+ __C);
+}
+
+#else
+#define _mm512_cvtx_roundph_ps(A, B) \
+ (__builtin_ia32_vcvtph2psx512_mask_round ((A), \
+ _mm512_setzero_ps (), \
+ (__mmask16)-1, \
+ (B)))
+
+#define _mm512_mask_cvtx_roundph_ps(A, B, C, D) \
+ (__builtin_ia32_vcvtph2psx512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvtx_roundph_ps(A, B, C) \
+ (__builtin_ia32_vcvtph2psx512_mask_round ((B), \
+ _mm512_setzero_ps (), \
+ (A), \
+ (C)))
+#endif /* __OPTIMIZE__ */
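+
+/* Note (illustrative, hypothetical variables): vcvtph2psx widens
+   sixteen half-precision elements to single precision.  Every _Float16
+   value is exactly representable as float, so the _round form's
+   argument matters only for exception suppression (_MM_FROUND_NO_EXC):
+
+     __m256h h = _mm256_set1_ph ((_Float16) 0.5);
+     __m512  f = _mm512_cvtxph_ps (h);
+*/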
+
+/* Intrinsics vcvtps2phx. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtxps_ph (__m512 __A)
+{
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtxps_ph (__m256h __A, __mmask16 __B, __m512 __C)
+{
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C,
+ __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtxps_ph (__mmask16 __A, __m512 __B)
+{
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B,
+ _mm256_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtx_roundps_ph (__m512 __A, int __B)
+{
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1,
+ __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtx_roundps_ph (__m256h __A, __mmask16 __B, __m512 __C, int __D)
+{
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C,
+ __A, __B, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtx_roundps_ph (__mmask16 __A, __m512 __B, int __C)
+{
+ return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B,
+ _mm256_setzero_ph (),
+ __A, __C);
+}
+
+#else
+#define _mm512_cvtx_roundps_ph(A, B) \
+ (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(A), \
+ _mm256_setzero_ph (),\
+ (__mmask16)-1, (B)))
+
+#define _mm512_mask_cvtx_roundps_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(C), \
+ (A), (B), (D)))
+
+#define _mm512_maskz_cvtx_roundps_ph(A, B, C) \
+ (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(B), \
+ _mm256_setzero_ph (),\
+ (A), (C)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtpd2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_ph (__m512d __A)
+{
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m512d __C)
+{
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C,
+ __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_ph (__mmask8 __A, __m512d __B)
+{
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B,
+ _mm_setzero_ph (),
+ __A,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_ph (__m512d __A, int __B)
+{
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_ph (__m128h __A, __mmask8 __B, __m512d __C, int __D)
+{
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C,
+ __A, __B, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_ph (__mmask8 __A, __m512d __B, int __C)
+{
+ return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B,
+ _mm_setzero_ph (),
+ __A, __C);
+}
+
+#else
+#define _mm512_cvt_roundpd_ph(A, B) \
+ (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(A), \
+ _mm_setzero_ph (), \
+ (__mmask8)-1, (B)))
+
+#define _mm512_mask_cvt_roundpd_ph(A, B, C, D) \
+ (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(C), \
+ (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundpd_ph(A, B, C) \
+ (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(B), \
+ _mm_setzero_ph (), \
+ (A), (C)))
+
+#endif /* __OPTIMIZE__ */
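+
+/* Usage sketch (illustrative only; hypothetical variables): narrowing
+   double to half precision loses precision and can overflow to
+   infinity, so the _round form is the natural place to choose a
+   rounding mode explicitly:
+
+     __m512d d = _mm512_set1_pd (1.0 / 3.0);
+     __m128h h = _mm512_cvt_roundpd_ph (d, _MM_FROUND_TO_ZERO
+					   | _MM_FROUND_NO_EXC);
+*/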
+
+/* Intrinsics vcvtsh2ss, vcvtsh2sd. */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_ss (__m128 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A,
+ _mm_setzero_ps (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsh_ss (__m128 __A, __mmask8 __B, __m128 __C,
+ __m128h __D)
+{
+ return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsh_ss (__mmask8 __A, __m128 __B,
+ __m128h __C)
+{
+ return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B,
+ _mm_setzero_ps (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_sd (__m128d __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A,
+ _mm_setzero_pd (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsh_sd (__m128d __A, __mmask8 __B, __m128d __C,
+ __m128h __D)
+{
+ return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsh_sd (__mmask8 __A, __m128d __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B,
+ _mm_setzero_pd (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_ss (__m128 __A, __m128h __B, const int __R)
+{
+ return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A,
+ _mm_setzero_ps (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsh_ss (__m128 __A, __mmask8 __B, __m128 __C,
+ __m128h __D, const int __R)
+{
+ return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsh_ss (__mmask8 __A, __m128 __B,
+ __m128h __C, const int __R)
+{
+ return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B,
+ _mm_setzero_ps (),
+ __A, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_sd (__m128d __A, __m128h __B, const int __R)
+{
+ return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A,
+ _mm_setzero_pd (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsh_sd (__m128d __A, __mmask8 __B, __m128d __C,
+ __m128h __D, const int __R)
+{
+ return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsh_sd (__mmask8 __A, __m128d __B, __m128h __C, const int __R)
+{
+ return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B,
+ _mm_setzero_pd (),
+ __A, __R);
+}
+
+#else
+#define _mm_cvt_roundsh_ss(A, B, R) \
+ (__builtin_ia32_vcvtsh2ss_mask_round ((B), (A), \
+ _mm_setzero_ps (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsh_ss(A, B, C, D, R) \
+ (__builtin_ia32_vcvtsh2ss_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundsh_ss(A, B, C, R) \
+ (__builtin_ia32_vcvtsh2ss_mask_round ((C), (B), \
+ _mm_setzero_ps (), \
+ (A), (R)))
+
+#define _mm_cvt_roundsh_sd(A, B, R) \
+ (__builtin_ia32_vcvtsh2sd_mask_round ((B), (A), \
+ _mm_setzero_pd (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsh_sd(A, B, C, D, R) \
+ (__builtin_ia32_vcvtsh2sd_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundsh_sd(A, B, C, R) \
+ (__builtin_ia32_vcvtsh2sd_mask_round ((C), (B), \
+ _mm_setzero_pd (), \
+ (A), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtss2sh, vcvtsd2sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_sh (__m128h __A, __m128 __B)
+{
+ return __builtin_ia32_vcvtss2sh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D)
+{
+ return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtss_sh (__mmask8 __A, __m128h __B, __m128 __C)
+{
+ return __builtin_ia32_vcvtss2sh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_sh (__m128h __A, __m128d __B)
+{
+ return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D)
+{
+ return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsd_sh (__mmask8 __A, __m128h __B, __m128d __C)
+{
+ return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_sh (__m128h __A, __m128 __B, const int __R)
+{
+ return __builtin_ia32_vcvtss2sh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D,
+ const int __R)
+{
+ return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundss_sh (__mmask8 __A, __m128h __B, __m128 __C,
+ const int __R)
+{
+ return __builtin_ia32_vcvtss2sh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_sh (__m128h __A, __m128d __B, const int __R)
+{
+ return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D,
+ const int __R)
+{
+ return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsd_sh (__mmask8 __A, __m128h __B, __m128d __C,
+ const int __R)
+{
+ return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B,
+ _mm_setzero_ph (),
+ __A, __R);
+}
+
+#else
+#define _mm_cvt_roundss_sh(A, B, R) \
+ (__builtin_ia32_vcvtss2sh_mask_round ((B), (A), \
+ _mm_setzero_ph (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundss_sh(A, B, C, D, R) \
+ (__builtin_ia32_vcvtss2sh_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundss_sh(A, B, C, R) \
+ (__builtin_ia32_vcvtss2sh_mask_round ((C), (B), \
+ _mm_setzero_ph (), \
+ (A), (R)))
+
+#define _mm_cvt_roundsd_sh(A, B, R) \
+ (__builtin_ia32_vcvtsd2sh_mask_round ((B), (A), \
+ _mm_setzero_ph (), \
+ (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsd_sh(A, B, C, D, R) \
+ (__builtin_ia32_vcvtsd2sh_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundsd_sh(A, B, C, R) \
+ (__builtin_ia32_vcvtsd2sh_mask_round ((C), (B), \
+ _mm_setzero_ph (), \
+ (A), (R)))
+
+#endif /* __OPTIMIZE__ */
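+
+/* Usage sketch (illustrative only; hypothetical variables): scalar
+   float->half conversion, where the first operand supplies the upper
+   seven result elements, mirroring the other scalar intrinsics:
+
+     __m128  s = _mm_set_ss (65504.0f);  // largest finite _Float16
+     __m128h a = _mm_set1_ph ((_Float16) 0.0);
+     __m128h h = _mm_cvtss_sh (a, s);
+*/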
+
+/* Intrinsics vfmaddsub[132,213,231]ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_ph (__m512h __A, __m512h __B, __m512h __C,
+ __mmask32 __U, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddsubph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmaddsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddsubph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
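+
+/* Note (illustrative; assumes __m512h a, b, c already hold operands):
+   fmaddsub alternates per lane -- even-indexed elements compute
+   a*b - c, odd-indexed elements a*b + c -- the usual building block
+   for interleaved complex arithmetic:
+
+     __m512h r = _mm512_fmaddsub_ph (a, b, c);
+     // r[2k] = a[2k]*b[2k] - c[2k]; r[2k+1] = a[2k+1]*b[2k+1] + c[2k+1]
+*/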
+
+/* Intrinsics vfmsubadd[132,213,231]ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_ph (__m512h __A, __mmask32 __U,
+ __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_ph (__m512h __A, __m512h __B,
+ __m512h __C, __mmask32 __U)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_ph (__mmask32 __U, __m512h __A,
+ __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_ph (__m512h __A, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_ph (__m512h __A, __m512h __B, __m512h __C,
+ __mmask32 __U, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubaddph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmsubadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmsubaddph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmsubaddph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubaddph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmsubaddph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmadd[132,213,231]ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_ph (__m512h __A, __m512h __B, __m512h __C,
+ __mmask32 __U, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmaddph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmaddph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmaddph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
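+
+/* Note (illustrative; assumes __m512h a, b, c and __mmask32 m): the
+   masked fmadd forms differ only in what an unselected lane holds --
+   _mask keeps the first operand, _mask3 keeps the addend, _maskz zeroes:
+
+     __m512h r0 = _mm512_mask_fmadd_ph (a, m, b, c);   // lane off: a
+     __m512h r1 = _mm512_mask3_fmadd_ph (a, b, c, m);  // lane off: c
+     __m512h r2 = _mm512_maskz_fmadd_ph (m, a, b, c);  // lane off: 0
+*/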
+
+/* Intrinsics vfnmadd[132,213,231]ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmaddph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmaddph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_ph (__m512h __A, __m512h __B, __m512h __C,
+ __mmask32 __U, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmaddph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmaddph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fnmadd_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfnmaddph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfnmaddph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfnmaddph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfnmaddph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmsub[132,213,231]ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmsubph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_ph (__m512h __A, __m512h __B, __m512h __C,
+ __mmask32 __U, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmsubph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfmsubph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfmsubph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfnmsub[132,213,231]ph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmsubph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfnmsubph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_ph (__m512h __A, __m512h __B, __m512h __C,
+ __mmask32 __U, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmsubph512_mask3 ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+ __m512h __C, const int __R)
+{
+ return (__m512h) __builtin_ia32_vfnmsubph512_maskz ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fnmsub_round_ph(A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfnmsubph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \
+ ((__m512h)__builtin_ia32_vfnmsubph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \
+ ((__m512h)__builtin_ia32_vfnmsubph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \
+ ((__m512h)__builtin_ia32_vfnmsubph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
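+
+/* Editor's note (illustrative, not upstream code): per lane, the four
+   packed-FP16 FMA families above compute fmadd = a*b + c, fmsub = a*b - c,
+   fnmadd = -(a*b) + c and fnmsub = -(a*b) - c.  A minimal sketch under the
+   hypothetical AVX512FP16_EXAMPLES guard:  */
+#ifdef AVX512FP16_EXAMPLES
+static void
+__example_fma_variants (__m512h __a, __m512h __b, __m512h __c,
+                        __m512h __r[4])
+{
+  __r[0] = _mm512_fmadd_ph (__a, __b, __c);   /*   a*b + c  */
+  __r[1] = _mm512_fmsub_ph (__a, __b, __c);   /*   a*b - c  */
+  __r[2] = _mm512_fnmadd_ph (__a, __b, __c);  /* -(a*b) + c  */
+  __r[3] = _mm512_fnmsub_ph (__a, __b, __c);  /* -(a*b) - c  */
+}
+#endif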
+
+/* Intrinsics vfmadd[132,213,231]sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask3 ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask3 ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+ __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fmadd_round_sh(A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), (C), (-1), (R)))
+#define _mm_mask_fmadd_round_sh(A, U, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), (C), (U), (R)))
+#define _mm_mask3_fmadd_round_sh(A, B, C, U, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_mask3 ((A), (B), (C), (U), (R)))
+#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
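+
+/* Editor's usage sketch (illustrative, not upstream code): the scalar sh
+   intrinsics compute only the low half-precision lane and copy the upper
+   seven lanes from the first operand.  _mm_set_sh and _mm_cvtsh_h are
+   defined earlier in this header.  Guarded by the hypothetical
+   AVX512FP16_EXAMPLES macro.  */
+#ifdef AVX512FP16_EXAMPLES
+static _Float16
+__example_scalar_fma (_Float16 __x, _Float16 __y, _Float16 __z)
+{
+  /* Returns x * y + z computed in _Float16.  */
+  __m128h __r = _mm_fmadd_sh (_mm_set_sh (__x), _mm_set_sh (__y),
+                              _mm_set_sh (__z));
+  return _mm_cvtsh_h (__r);
+}
+#endif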
+
+/* Intrinsics vfnmadd[132,213,231]sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_maskz ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+ __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfnmaddsh3_maskz ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fnmadd_round_sh(A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfnmaddsh3_mask ((A), (B), (C), (-1), (R)))
+#define _mm_mask_fnmadd_round_sh(A, U, B, C, R) \
+ ((__m128h) __builtin_ia32_vfnmaddsh3_mask ((A), (B), (C), (U), (R)))
+#define _mm_mask3_fnmadd_round_sh(A, B, C, U, R) \
+ ((__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((A), (B), (C), (U), (R)))
+#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfnmaddsh3_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmsub[132,213,231]sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+ (__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ (__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+ (__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+ __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+ (__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fmsub_round_sh(A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), -(C), (-1), (R)))
+#define _mm_mask_fmsub_round_sh(A, U, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), -(C), (U), (R)))
+#define _mm_mask3_fmsub_round_sh(A, B, C, U, R) \
+ ((__m128h) __builtin_ia32_vfmsubsh3_mask3 ((A), (B), (C), (U), (R)))
+#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), (B), -(C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfnmsub[132,213,231]sh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ -(__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) -1,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ -(__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+ -(__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+ -(__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ -(__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) -1,
+ __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+ -(__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+ const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+ -(__v8hf) __A,
+ (__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+ __m128h __B, const int __R)
+{
+ return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+ -(__v8hf) __A,
+ -(__v8hf) __B,
+ (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fnmsub_round_sh(A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), -(B), -(C), (-1), (R)))
+#define _mm_mask_fnmsub_round_sh(A, U, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), -(B), -(C), (U), (R)))
+#define _mm_mask3_fnmsub_round_sh(A, B, C, U, R) \
+ ((__m128h) __builtin_ia32_vfmsubsh3_mask3 ((A), -(B), (C), (U), (R)))
+#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \
+ ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), -(B), -(C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]maddcph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_pch (__m512h __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
+ __mmask16 __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+ (__v32hf) __C,
+ (__v32hf) __D, __B,
+ __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
+ __mmask16 __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A,
+ (__v32hf) __B,
+ (__v32hf) __C,
+ __D, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B,
+ (__v32hf) __C,
+ (__v32hf) __D,
+ __A, __E);
+}
+
+#else
+#define _mm512_fcmadd_round_pch(A, B, C, D) \
+ (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D))
+
+#define _mm512_mask_fcmadd_round_pch(A, B, C, D, E) \
+ ((__m512h) \
+ __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A), \
+ (__v32hf) (C), \
+ (__v32hf) (D), \
+ (B), (E)))
+
+#define _mm512_mask3_fcmadd_round_pch(A, B, C, D, E) \
+ ((__m512h) \
+ __builtin_ia32_vfcmaddcph512_mask3_round ((A), (B), (C), (D), (E)))
+
+#define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E) \
+ (__m512h) \
+ __builtin_ia32_vfcmaddcph512_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm512_fmadd_round_pch(A, B, C, D) \
+ (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D))
+
+#define _mm512_mask_fmadd_round_pch(A, B, C, D, E) \
+ ((__m512h) \
+ __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A), \
+ (__v32hf) (C), \
+ (__v32hf) (D), \
+ (B), (E)))
+
+#define _mm512_mask3_fmadd_round_pch(A, B, C, D, E) \
+ (__m512h) \
+ __builtin_ia32_vfmaddcph512_mask3_round ((A), (B), (C), (D), (E))
+
+#define _mm512_maskz_fmadd_round_pch(A, B, C, D, E) \
+ (__m512h) \
+ __builtin_ia32_vfmaddcph512_maskz_round ((B), (C), (D), (A), (E))
+
+#endif /* __OPTIMIZE__ */
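+
+/* Editor's note (illustrative, not upstream code): the *_pch intrinsics
+   treat a __m512h as 16 complex values, each a (real, imaginary) pair of
+   _Float16, which is why the masks are __mmask16 rather than __mmask32.
+   _mm512_fmadd_pch computes A*B + C per complex lane, while
+   _mm512_fcmadd_pch multiplies A by the complex conjugate of B before
+   accumulating.  A sketch under the hypothetical AVX512FP16_EXAMPLES
+   guard:  */
+#ifdef AVX512FP16_EXAMPLES
+static __m512h
+__example_cdot_step (__m512h __acc, __m512h __a, __m512h __b)
+{
+  /* acc += a * conj(b), the inner step of a complex dot product.  */
+  return _mm512_fcmadd_pch (__a, __b, __acc);
+}
+#endif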
+
+/* Intrinsics vf[,c]mulcph. */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmul_pch (__m512h __A, __m512h __B)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmul_pch (__mmask16 __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmul_pch (__m512h __A, __m512h __B)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmul_pch (__mmask16 __A, __m512h __B, __m512h __C)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmul_round_pch (__m512h __A, __m512h __B, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmul_round_pch (__mmask16 __A, __m512h __B,
+ __m512h __C, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmul_round_pch (__m512h __A, __m512h __B, const int __D)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_round ((__v32hf) __A,
+ (__v32hf) __B,
+ __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+ __m512h __D, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C,
+ (__v32hf) __D,
+ (__v32hf) __A,
+ __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmul_round_pch (__mmask16 __A, __m512h __B,
+ __m512h __C, const int __E)
+{
+ return (__m512h)
+ __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B,
+ (__v32hf) __C,
+ _mm512_setzero_ph (),
+ __A, __E);
+}
+
+#else
+#define _mm512_fcmul_round_pch(A, B, D) \
+ (__m512h) __builtin_ia32_vfcmulcph512_round ((A), (B), (D))
+
+#define _mm512_mask_fcmul_round_pch(A, B, C, D, E) \
+ (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((C), (D), (A), (B), (E))
+
+#define _mm512_maskz_fcmul_round_pch(A, B, C, E) \
+ (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((B), (C), \
+ (__v32hf) \
+ _mm512_setzero_ph (), \
+ (A), (E))
+
+#define _mm512_fmul_round_pch(A, B, D) \
+ (__m512h) __builtin_ia32_vfmulcph512_round ((A), (B), (D))
+
+#define _mm512_mask_fmul_round_pch(A, B, C, D, E) \
+ (__m512h) __builtin_ia32_vfmulcph512_mask_round ((C), (D), (A), (B), (E))
+
+#define _mm512_maskz_fmul_round_pch(A, B, C, E) \
+ (__m512h) __builtin_ia32_vfmulcph512_mask_round ((B), (C), \
+ (__v32hf) \
+ _mm512_setzero_ph (), \
+ (A), (E))
+
+#endif /* __OPTIMIZE__ */
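+
+/* Editor's note (illustrative, not upstream code): _mm512_fmul_pch is a
+   plain complex multiply, while _mm512_fcmul_pch multiplies by the complex
+   conjugate of its second operand, so fcmul (z, z) leaves |z|^2 in the
+   real lanes and zero in the imaginary lanes.  A sketch under the
+   hypothetical AVX512FP16_EXAMPLES guard:  */
+#ifdef AVX512FP16_EXAMPLES
+static __m512h
+__example_sq_magnitude (__m512h __z)
+{
+  /* Real lanes: re^2 + im^2; imaginary lanes: 0.  */
+  return _mm512_fcmul_pch (__z, __z);
+}
+#endif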
+
+/* Intrinsics vf[,c]maddcsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D,
+ __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ __D);
+}
+#else
+#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) \
+ __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (C), \
+ (__v8hf) (D), \
+ (B), (E)))
+
+#define _mm_mask3_fcmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) \
+ __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) (A), \
+ (__v8hf) (B), \
+ (__v8hf) (C), \
+ (D), (E)))
+
+#define _mm_maskz_fcmadd_round_sch(A, B, C, D, E) \
+ __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm_fcmadd_round_sch(A, B, C, D) \
+ __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D))
+
+#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) \
+ __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \
+ (__v8hf) (C), \
+ (__v8hf) (D), \
+ (B), (E)))
+
+#define _mm_mask3_fmadd_round_sch(A, B, C, D, E) \
+ ((__m128h) \
+ __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) (A), \
+ (__v8hf) (B), \
+ (__v8hf) (C), \
+ (D), (E)))
+
+#define _mm_maskz_fmadd_round_sch(A, B, C, D, E) \
+ __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm_fmadd_round_sch(A, B, C, D) \
+ __builtin_ia32_vfmaddcsh_round ((A), (B), (C), (D))
+
+#endif /* __OPTIMIZE__ */
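+
+/* Editor's note (illustrative, not upstream code): the *_sch intrinsics
+   operate on the low (real, imaginary) _Float16 pair only, with the upper
+   three complex lanes copied from the first operand.  A sketch under the
+   hypothetical AVX512FP16_EXAMPLES guard:  */
+#ifdef AVX512FP16_EXAMPLES
+static __m128h
+__example_scalar_cfma (__m128h __acc, __m128h __a, __m128h __b)
+{
+  /* Low complex lane: a * conj(b) + acc; upper lanes come from a.  */
+  return _mm_fcmadd_sch (__a, __b, __acc);
+}
+#endif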
+
+/* Intrinsics vf[,c]mulcsh. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_sch (__m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_sch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_sch (__m128h __A, __m128h __B)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_sch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_round_sch (__m128h __A, __m128h __B, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B,
+ __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_round_sch (__m128h __A, __m128h __B, const int __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_round ((__v8hf) __A,
+ (__v8hf) __B, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+ __m128h __D, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A,
+ __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, const int __E)
+{
+ return (__m128h)
+ __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A, __E);
+}
+
+#else
+#define _mm_fcmul_round_sch(__A, __B, __D) \
+ (__m128h) __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, \
+ (__v8hf) __B, __D)
+
+#define _mm_mask_fcmul_round_sch(__A, __B, __C, __D, __E) \
+ (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, \
+ (__v8hf) __D, \
+ (__v8hf) __A, \
+ __B, __E)
+
+#define _mm_maskz_fcmul_round_sch(__A, __B, __C, __E) \
+ (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, \
+ (__v8hf) __C, \
+ _mm_setzero_ph (), \
+ __A, __E)
+
+#define _mm_fmul_round_sch(__A, __B, __D) \
+ (__m128h) __builtin_ia32_vfmulcsh_round ((__v8hf) __A, \
+ (__v8hf) __B, __D)
+
+#define _mm_mask_fmul_round_sch(__A, __B, __C, __D, __E) \
+ (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, \
+ (__v8hf) __D, \
+ (__v8hf) __A, \
+ __B, __E)
+
+#define _mm_maskz_fmul_round_sch(__A, __B, __C, __E) \
+ (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, \
+ (__v8hf) __C, \
+ _mm_setzero_ph (), \
+ __A, __E)
+
+#endif /* __OPTIMIZE__ */
+
+#define _MM512_REDUCE_OP(op) \
+ __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+ __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+ __m256h __T3 = (__T1 op __T2); \
+ __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \
+ __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \
+ __m128h __T6 = (__T4 op __T5); \
+ __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \
+ (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
+ __m128h __T8 = (__T6 op __T7); \
+ __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \
+ (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \
+ __m128h __T10 = __T8 op __T9; \
+ return __T10[0] op __T10[1]
+
+/* Intrinsics _mm512_reduce_add_ph and _mm512_reduce_mul_ph.  */
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (*);
+}
+
+#undef _MM512_REDUCE_OP
+
+#ifdef __AVX512VL__
+
+#define _MM512_REDUCE_OP(op) \
+ __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \
+ __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \
+ __m256h __T3 = __builtin_ia32_##op##ph256_mask (__T1, __T2, \
+ _mm256_setzero_ph (), (__mmask16) -1); \
+ __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \
+ __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \
+ __m128h __T6 = __builtin_ia32_##op##ph128_mask \
+ (__T4, __T5, _mm_setzero_ph (),(__mmask8) -1); \
+ __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \
+ (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
+ __m128h __T8 = (__m128h) __builtin_ia32_##op##ph128_mask \
+ (__T6, __T7, _mm_setzero_ph (),(__mmask8) -1); \
+ __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \
+ (__v8hi) { 4, 5 }); \
+ __m128h __T10 = __builtin_ia32_##op##ph128_mask \
+ (__T8, __T9, _mm_setzero_ph (),(__mmask8) -1); \
+ __m128h __T11 = (__m128h) __builtin_shuffle (__T10, \
+ (__v8hi) { 1, 0 }); \
+ __m128h __T12 = __builtin_ia32_##op##ph128_mask \
+ (__T10, __T11, _mm_setzero_ph (),(__mmask8) -1); \
+ return __T12[0]
+
+#else
+
+#define _MM512_REDUCE_OP(op) \
+ __m512h __T1 = (__m512h) __builtin_shuffle ((__m512d) __A, \
+ (__v8di) { 4, 5, 6, 7, 0, 0, 0, 0 }); \
+ __m512h __T2 = _mm512_##op##_ph (__A, __T1); \
+ __m512h __T3 = (__m512h) __builtin_shuffle ((__m512d) __T2, \
+ (__v8di) { 2, 3, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T4 = _mm512_##op##_ph (__T2, __T3); \
+ __m512h __T5 = (__m512h) __builtin_shuffle ((__m512d) __T4, \
+ (__v8di) { 1, 0, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T6 = _mm512_##op##_ph (__T4, __T5); \
+ __m512h __T7 = (__m512h) __builtin_shuffle ((__m512) __T6, \
+ (__v16si) { 1, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T8 = _mm512_##op##_ph (__T6, __T7); \
+ __m512h __T9 = (__m512h) __builtin_shuffle (__T8, \
+ (__v32hi) { 1, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0, \
+ 0, 0, 0, 0, 0, 0, 0, 0 }); \
+ __m512h __T10 = _mm512_##op##_ph (__T8, __T9); \
+ return __T10[0]
+#endif
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (min);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_ph (__m512h __A)
+{
+ _MM512_REDUCE_OP (max);
+}
+
+#undef _MM512_REDUCE_OP
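+
+/* Editor's usage sketch (illustrative, not upstream code): the reduce
+   intrinsics above combine all 32 half-precision lanes through a
+   log2-depth shuffle tree and return a scalar.  _mm512_loadu_ph is
+   defined earlier in this header.  Guarded by the hypothetical
+   AVX512FP16_EXAMPLES macro.  */
+#ifdef AVX512FP16_EXAMPLES
+static _Float16
+__example_horizontal_sum (const _Float16 *__p)
+{
+  /* Sum 32 contiguous _Float16 values.  */
+  return _mm512_reduce_add_ph (_mm512_loadu_ph (__p));
+}
+#endif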
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_ph (__mmask32 __U, __m512h __A, __m512h __W)
+{
+ return (__m512h) __builtin_ia32_movdquhi512_mask ((__v32hi) __W,
+ (__v32hi) __A,
+ (__mmask32) __U);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_ph (__m512h __A, __m512i __I, __m512h __B)
+{
+ return (__m512h) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+ (__v32hi) __I,
+ (__v32hi) __B,
+ (__mmask32)-1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_ph (__m512i __A, __m512h __B)
+{
+ return (__m512h) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+ (__v32hi) __A,
+ (__v32hi)
+ (_mm512_setzero_ph ()),
+ (__mmask32)-1);
+}
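+
+/* Editor's usage sketch (illustrative, not upstream code):
+   _mm512_permutexvar_ph selects half-precision lanes by the 16-bit
+   indices in its first argument; _mm512_set_epi16 comes from
+   avx512fintrin.h via <immintrin.h>.  Guarded by the hypothetical
+   AVX512FP16_EXAMPLES macro.  */
+#ifdef AVX512FP16_EXAMPLES
+static __m512h
+__example_reverse_lanes (__m512h __v)
+{
+  /* idx[i] = 31 - i, so the 32 lanes come out reversed.  */
+  __m512i __idx = _mm512_set_epi16 (0, 1, 2, 3, 4, 5, 6, 7,
+                                    8, 9, 10, 11, 12, 13, 14, 15,
+                                    16, 17, 18, 19, 20, 21, 22, 23,
+                                    24, 25, 26, 27, 28, 29, 30, 31);
+  return _mm512_permutexvar_ph (__idx, __v);
+}
+#endif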
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_pch (_Float16 _Complex __A)
+{
+ union
+ {
+ _Float16 _Complex __a;
+ float __b;
+ } __u = { .__a = __A};
+
+ return (__m512h) _mm512_set1_ps (__u.__b);
+}
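+
+/* Editor's usage sketch (illustrative, not upstream code):
+   _mm512_set1_pch broadcasts one (real, imaginary) pair to all 16 complex
+   lanes; __builtin_complex is the GNU builtin for forming a complex
+   value.  Guarded by the hypothetical AVX512FP16_EXAMPLES macro.  */
+#ifdef AVX512FP16_EXAMPLES
+static __m512h
+__example_broadcast_i (void)
+{
+  /* Every complex lane becomes 0 + 1i.  */
+  return _mm512_set1_pch (__builtin_complex ((_Float16) 0.0f,
+                                             (_Float16) 1.0f));
+}
+#endif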
+
+/* The intrinsics below are aliases for the f*mul_*ch intrinsics.  */
+#define _mm512_mul_pch(A, B) _mm512_fmul_pch ((A), (B))
+#define _mm512_mask_mul_pch(W, U, A, B) \
+ _mm512_mask_fmul_pch ((W), (U), (A), (B))
+#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch ((U), (A), (B))
+#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch ((A), (B), (R))
+#define _mm512_mask_mul_round_pch(W, U, A, B, R) \
+ _mm512_mask_fmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_mul_round_pch(U, A, B, R) \
+ _mm512_maskz_fmul_round_pch ((U), (A), (B), (R))
+
+#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch ((A), (B))
+#define _mm512_mask_cmul_pch(W, U, A, B) \
+ _mm512_mask_fcmul_pch ((W), (U), (A), (B))
+#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch ((U), (A), (B))
+#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch ((A), (B), (R))
+#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \
+ _mm512_mask_fcmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_cmul_round_pch(U, A, B, R) \
+ _mm512_maskz_fcmul_round_pch ((U), (A), (B), (R))
+
+#define _mm_mul_sch(A, B) _mm_fmul_sch ((A), (B))
+#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch ((W), (U), (A), (B))
+#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch ((U), (A), (B))
+#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch ((A), (B), (R))
+#define _mm_mask_mul_round_sch(W, U, A, B, R) \
+ _mm_mask_fmul_round_sch ((W), (U), (A), (B), (R))
+#define _mm_maskz_mul_round_sch(U, A, B, R) \
+ _mm_maskz_fmul_round_sch ((U), (A), (B), (R))
+
+#define _mm_cmul_sch(A, B) _mm_fcmul_sch ((A), (B))
+#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch ((W), (U), (A), (B))
+#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch ((U), (A), (B))
+#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch ((A), (B), (R))
+#define _mm_mask_cmul_round_sch(W, U, A, B, R) \
+ _mm_mask_fcmul_round_sch ((W), (U), (A), (B), (R))
+#define _mm_maskz_cmul_round_sch(U, A, B, R) \
+ _mm_maskz_fcmul_round_sch ((U), (A), (B), (R))
+
+#ifdef __DISABLE_AVX512FP16__
+#undef __DISABLE_AVX512FP16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512FP16__ */
+
+#endif /* __AVX512FP16INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16VLINTRIN_H_INCLUDED
+#define __AVX512FP16VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512FP16__)
+#pragma GCC push_options
+#pragma GCC target("avx512fp16,avx512vl")
+#define __DISABLE_AVX512FP16VL__
+#endif /* !__AVX512VL__ || !__AVX512FP16__ */
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_ps (__m128h __a)
+{
+ return (__m128) __a;
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_ps (__m256h __a)
+{
+ return (__m256) __a;
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_pd (__m128h __a)
+{
+ return (__m128d) __a;
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_pd (__m256h __a)
+{
+ return (__m256d) __a;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_si128 (__m128h __a)
+{
+ return (__m128i) __a;
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_si256 (__m256h __a)
+{
+ return (__m256i) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_ph (__m128 __a)
+{
+ return (__m128h) __a;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_ph (__m256 __a)
+{
+ return (__m256h) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ph (__m128d __a)
+{
+ return (__m128h) __a;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_ph (__m256d __a)
+{
+ return (__m256h) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ph (__m128i __a)
+{
+ return (__m128h) __a;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_ph (__m256i __a)
+{
+ return (__m256h) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph256_ph128 (__m256h __A)
+{
+ union
+ {
+ __m128h __a[2];
+ __m256h __v;
+ } __u = { .__v = __A };
+ return __u.__a[0];
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph128_ph256 (__m128h __A)
+{
+ union
+ {
+ __m128h __a[2];
+ __m256h __v;
+ } __u;
+ __u.__a[0] = __A;
+ return __u.__v;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextph128_ph256 (__m128h __A)
+{
+ return (__m256h) _mm256_insertf128_ps (_mm256_setzero_ps (),
+ (__m128) __A, 0);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conj_pch (__m256h __A)
+{
+ return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conj_pch (__m256h __W, __mmask8 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf)
+ _mm256_conj_pch (__A),
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conj_pch (__mmask8 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf)
+ _mm256_conj_pch (__A),
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conj_pch (__m128h __A)
+{
+ return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conj_pch (__m128h __W, __mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A),
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conj_pch (__mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A),
+ (__v4sf) _mm_setzero_ps (),
+ (__mmask8) __U);
+}
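+
+/* Editor's note (illustrative, not upstream code): _mm_conj_pch negates
+   the imaginary half of each complex lane by XORing every 32-bit pair
+   with 0x80000000, the sign bit of the high _Float16.  A sketch under the
+   hypothetical AVX512FP16_EXAMPLES guard:  */
+#ifdef AVX512FP16_EXAMPLES
+static __m128h
+__example_conjugate (__m128h __z)
+{
+  /* (a + bi) -> (a - bi) in all four complex lanes.  */
+  return _mm_conj_pch (__z);
+}
+#endif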
+
+/* Intrinsics v[add,sub,mul,div]ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ph (__m128h __A, __m128h __B)
+{
+ return (__m128h) ((__v8hf) __A + (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_ph (__m256h __A, __m256h __B)
+{
+ return (__m256h) ((__v16hf) __A + (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_addph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+ return __builtin_ia32_addph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_addph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_addph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ph (__m128h __A, __m128h __B)
+{
+ return (__m128h) ((__v8hf) __A - (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_ph (__m256h __A, __m256h __B)
+{
+ return (__m256h) ((__v16hf) __A - (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_subph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+ return __builtin_ia32_subph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_subph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_subph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ph (__m128h __A, __m128h __B)
+{
+ return (__m128h) ((__v8hf) __A * (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_ph (__m256h __A, __m256h __B)
+{
+ return (__m256h) ((__v16hf) __A * (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_mulph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+ return __builtin_ia32_mulph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_mulph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_mulph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ph (__m128h __A, __m128h __B)
+{
+ return (__m128h) ((__v8hf) __A / (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_ph (__m256h __A, __m256h __B)
+{
+ return (__m256h) ((__v16hf) __A / (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_divph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+ return __builtin_ia32_divph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_divph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_divph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
+}
+
+/* Intrinsics v[max,min]ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ph (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_maxph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_ph (__m256h __A, __m256h __B)
+{
+ return __builtin_ia32_maxph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_maxph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+ return __builtin_ia32_maxph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_maxph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_maxph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ph (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_minph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_ph (__m256h __A, __m256h __B)
+{
+ return __builtin_ia32_minph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_minph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+ return __builtin_ia32_minph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_minph128_mask (__B, __C, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_minph256_mask (__B, __C,
+ _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_ph (__m128h __A)
+{
+ return (__m128h) _mm_and_si128 (_mm_set1_epi32 (0x7FFF7FFF),
+ (__m128i) __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_ph (__m256h __A)
+{
+ return (__m256h) _mm256_and_si256 (_mm256_set1_epi32 (0x7FFF7FFF),
+ (__m256i) __A);
+}
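+
+/* The absolute-value forms above need no dedicated instruction: they
+   clear the sign bit of every _Float16 lane by ANDing with 0x7FFF,
+   broadcast as 0x7FFF7FFF so that one 32-bit constant covers two
+   half-precision lanes.  */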
+
+/* Intrinsics vcmpph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ph_mask (__m128h __A, __m128h __B, const int __C)
+{
+ return (__mmask8) __builtin_ia32_cmpph128_mask (__A, __B, __C,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ph_mask (__mmask8 __A, __m128h __B, __m128h __C,
+ const int __D)
+{
+ return (__mmask8) __builtin_ia32_cmpph128_mask (__B, __C, __D, __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ph_mask (__m256h __A, __m256h __B, const int __C)
+{
+ return (__mmask16) __builtin_ia32_cmpph256_mask (__A, __B, __C,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C,
+ const int __D)
+{
+ return (__mmask16) __builtin_ia32_cmpph256_mask (__B, __C, __D,
+ __A);
+}
+
+#else
+#define _mm_cmp_ph_mask(A, B, C) \
+ (__builtin_ia32_cmpph128_mask ((A), (B), (C), (-1)))
+
+#define _mm_mask_cmp_ph_mask(A, B, C, D) \
+ (__builtin_ia32_cmpph128_mask ((B), (C), (D), (A)))
+
+#define _mm256_cmp_ph_mask(A, B, C) \
+ (__builtin_ia32_cmpph256_mask ((A), (B), (C), (-1)))
+
+#define _mm256_mask_cmp_ph_mask(A, B, C, D) \
+ (__builtin_ia32_cmpph256_mask ((B), (C), (D), (A)))
+
+#endif /* __OPTIMIZE__ */
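+
+/* Illustrative use of the comparison forms (a sketch, not part of the
+   header proper): the immediate selects one of the _CMP_* predicates
+   from <immintrin.h>, and the result is a bitmask with one bit per
+   lane:
+
+     __m128h a = _mm_set1_ph (1.0f16), b = _mm_set1_ph (2.0f16);
+     __mmask8 m = _mm_cmp_ph_mask (a, b, _CMP_LT_OS);
+     m == 0xFF, since every lane of a compares less-than b.  */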
+
+/* Intrinsics vsqrtph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ph (__m128h __A)
+{
+ return __builtin_ia32_sqrtph128_mask (__A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_ph (__m256h __A)
+{
+ return __builtin_ia32_sqrtph256_mask (__A, _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_sqrtph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+ return __builtin_ia32_sqrtph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_ph (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_sqrtph128_mask (__B, _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_ph (__mmask16 __A, __m256h __B)
+{
+ return __builtin_ia32_sqrtph256_mask (__B, _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrsqrtph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ph (__m128h __A)
+{
+ return __builtin_ia32_rsqrtph128_mask (__A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt_ph (__m256h __A)
+{
+ return __builtin_ia32_rsqrtph256_mask (__A, _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_rsqrtph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+ return __builtin_ia32_rsqrtph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt_ph (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_rsqrtph128_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt_ph (__mmask16 __A, __m256h __B)
+{
+ return __builtin_ia32_rsqrtph256_mask (__B, _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vrcpph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ph (__m128h __A)
+{
+ return __builtin_ia32_rcpph128_mask (__A, _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp_ph (__m256h __A)
+{
+ return __builtin_ia32_rcpph256_mask (__A, _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_rcpph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rcp_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+ return __builtin_ia32_rcpph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp_ph (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_rcpph128_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rcp_ph (__mmask16 __A, __m256h __B)
+{
+ return __builtin_ia32_rcpph256_mask (__B, _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vscalefph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ph (__m128h __A, __m128h __B)
+{
+ return __builtin_ia32_scalefph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_ph (__m256h __A, __m256h __B)
+{
+ return __builtin_ia32_scalefph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return __builtin_ia32_scalefph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_ph (__m256h __A, __mmask16 __B, __m256h __C,
+ __m256h __D)
+{
+ return __builtin_ia32_scalefph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return __builtin_ia32_scalefph128_mask (__B, __C,
+ _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+ return __builtin_ia32_scalefph256_mask (__B, __C,
+ _mm256_setzero_ph (),
+ __A);
+}
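+
+/* vscalefph computes __A * 2^floor(__B) lane-wise, applying a
+   per-lane binary exponent in a single instruction.  Sketch:
+
+     __m128h x = _mm_set1_ph (3.0f16);
+     __m128h e = _mm_set1_ph (4.0f16);
+     __m128h r = _mm_scalef_ph (x, e);   each lane: 3 * 2^4 = 48.  */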
+
+/* Intrinsics vreduceph. */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ph (__m128h __A, int __B)
+{
+ return __builtin_ia32_reduceph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_reduceph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ph (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_reduceph128_mask (__B, __C,
+ _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_ph (__m256h __A, int __B)
+{
+ return __builtin_ia32_reduceph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_ph (__m256h __A, __mmask16 __B, __m256h __C, int __D)
+{
+ return __builtin_ia32_reduceph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_ph (__mmask16 __A, __m256h __B, int __C)
+{
+ return __builtin_ia32_reduceph256_mask (__B, __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+#else
+#define _mm_reduce_ph(A, B) \
+ (__builtin_ia32_reduceph128_mask ((A), (B), \
+ _mm_setzero_ph (), \
+ ((__mmask8)-1)))
+
+#define _mm_mask_reduce_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph128_mask ((C), (D), (A), (B)))
+
+#define _mm_maskz_reduce_ph(A, B, C) \
+ (__builtin_ia32_reduceph128_mask ((B), (C), _mm_setzero_ph (), (A)))
+
+#define _mm256_reduce_ph(A, B) \
+ (__builtin_ia32_reduceph256_mask ((A), (B), \
+ _mm256_setzero_ph (), \
+ ((__mmask16)-1)))
+
+#define _mm256_mask_reduce_ph(A, B, C, D) \
+ (__builtin_ia32_reduceph256_mask ((C), (D), (A), (B)))
+
+#define _mm256_maskz_reduce_ph(A, B, C) \
+ (__builtin_ia32_reduceph256_mask ((B), (C), _mm256_setzero_ph (), (A)))
+
+#endif /* __OPTIMIZE__ */
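+
+/* As elsewhere in the x86 intrinsic headers, the inline definitions
+   above exist only under __OPTIMIZE__: the builtins demand a
+   compile-time-constant immediate, which an inline wrapper only
+   reliably preserves when optimization is enabled.  Without -O the
+   macro forms substitute the literal directly.  */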
+
+/* Intrinsics vrndscaleph. */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_ph (__m128h __A, int __B)
+{
+ return __builtin_ia32_rndscaleph128_mask (__A, __B,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D)
+{
+ return __builtin_ia32_rndscaleph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_ph (__mmask8 __A, __m128h __B, int __C)
+{
+ return __builtin_ia32_rndscaleph128_mask (__B, __C,
+ _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_roundscale_ph (__m256h __A, int __B)
+{
+ return __builtin_ia32_rndscaleph256_mask (__A, __B,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_roundscale_ph (__m256h __A, __mmask16 __B, __m256h __C,
+ int __D)
+{
+ return __builtin_ia32_rndscaleph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_roundscale_ph (__mmask16 __A, __m256h __B, int __C)
+{
+ return __builtin_ia32_rndscaleph256_mask (__B, __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+#else
+#define _mm_roundscale_ph(A, B) \
+ (__builtin_ia32_rndscaleph128_mask ((A), (B), _mm_setzero_ph (), \
+ ((__mmask8)-1)))
+
+#define _mm_mask_roundscale_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph128_mask ((C), (D), (A), (B)))
+
+#define _mm_maskz_roundscale_ph(A, B, C) \
+ (__builtin_ia32_rndscaleph128_mask ((B), (C), _mm_setzero_ph (), (A)))
+
+#define _mm256_roundscale_ph(A, B) \
+ (__builtin_ia32_rndscaleph256_mask ((A), (B), \
+ _mm256_setzero_ph(), \
+ ((__mmask16)-1)))
+
+#define _mm256_mask_roundscale_ph(A, B, C, D) \
+ (__builtin_ia32_rndscaleph256_mask ((C), (D), (A), (B)))
+
+#define _mm256_maskz_roundscale_ph(A, B, C) \
+ (__builtin_ia32_rndscaleph256_mask ((B), (C), \
+ _mm256_setzero_ph (), (A)))
+
+#endif /* __OPTIMIZE__ */
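+
+/* vrndscaleph rounds each lane to a fixed number of fraction bits:
+   imm8[7:4] gives the bit count M and imm8[3:0] the rounding control,
+   so the result is 2^-M * round(2^M * x).  With an immediate of 0 this
+   is a plain round-to-nearest-integer, e.g. (sketch)
+
+     __m128h r = _mm_roundscale_ph (x, 0);  */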
+
+/* Intrinsics vfpclassph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_ph_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ph_mask (__m128h __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_ph_mask (__mmask16 __U, __m256h __A, const int __imm)
+{
+ return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_ph_mask (__m256h __A, const int __imm)
+{
+ return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A,
+ __imm,
+ (__mmask16) -1);
+}
+
+#else
+#define _mm_fpclass_ph_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \
+ (int) (C),(__mmask8)-1))
+
+#define _mm_mask_fpclass_ph_mask(u, X, C) \
+ ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \
+ (int) (C),(__mmask8)(u)))
+
+#define _mm256_fpclass_ph_mask(X, C) \
+ ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \
+ (int) (C),(__mmask16)-1))
+
+#define _mm256_mask_fpclass_ph_mask(u, X, C) \
+ ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \
+ (int) (C),(__mmask16)(u)))
+#endif /* __OPTIMIZE__ */
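+
+/* vfpclassph tests each lane against the categories selected by the
+   immediate (QNaN, +/-0, +/-Inf, denormal, finite-negative and SNaN
+   bits) and returns the mask of matching lanes.  Sketch, where 0x81
+   combines the QNaN (bit 0) and SNaN (bit 7) categories:
+
+     __mmask8 nan_lanes = _mm_fpclass_ph_mask (x, 0x81);  */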
+
+/* Intrinsics vgetexpph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_ph (__m256h __A)
+{
+ return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_ph (__m256h __W, __mmask16 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+ (__v16hf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_ph (__mmask16 __U, __m256h __A)
+{
+ return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ph (__m128h __A)
+{
+ return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_ph (__m128h __W, __mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+ (__v8hf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_ph (__mmask8 __U, __m128h __A)
+{
+ return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) __U);
+}
+
+/* Intrinsics vgetmantph. */
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_ph (__m256h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+ (__C << 2) | __B,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_ph (__m256h __W, __mmask16 __U, __m256h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+ (__C << 2) | __B,
+ (__v16hf) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_ph (__mmask16 __U, __m256h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+ (__C << 2) | __B,
+ (__v16hf)
+ _mm256_setzero_ph (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ph (__m128h __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+ (__C << 2) | __B,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_ph (__m128h __W, __mmask8 __U, __m128h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+ (__C << 2) | __B,
+ (__v8hf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_ph (__mmask8 __U, __m128h __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+ (__C << 2) | __B,
+ (__v8hf)
+ _mm_setzero_ph (),
+ (__mmask8) __U);
+}
+
+#else
+#define _mm256_getmant_ph(X, B, C) \
+ ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16hf)(__m256h)_mm256_setzero_ph (), \
+ (__mmask16)-1))
+
+#define _mm256_mask_getmant_ph(W, U, X, B, C) \
+ ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16hf)(__m256h)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_getmant_ph(U, X, B, C) \
+ ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v16hf)(__m256h)_mm256_setzero_ph (), \
+ (__mmask16)(U)))
+
+#define _mm_getmant_ph(X, B, C) \
+ ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8hf)(__m128h)_mm_setzero_ph (), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_ph(W, U, X, B, C) \
+ ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8hf)(__m128h)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ph(U, X, B, C) \
+ ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8hf)(__m128h)_mm_setzero_ph (), \
+ (__mmask8)(U)))
+
+#endif /* __OPTIMIZE__ */
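+
+/* In the getmant forms the _MM_MANTISSA_NORM_ENUM interval selector
+   occupies the low two bits of the immediate and the
+   _MM_MANTISSA_SIGN_ENUM control the next two, hence the
+   (__C << 2) | __B packing above.  */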
+
+/* Intrinsics vcvtph2dq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2dq128_mask (__A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2dq128_mask (__C, ( __v4si) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2dq128_mask (__B,
+ (__v4si) _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2dq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2dq256_mask (__C, ( __v8si) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2dq256_mask (__B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __A);
+}
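+
+/* The vcvtph2dq family above converts under the current MXCSR
+   rounding mode; the vcvttph2* forms further below instead truncate
+   toward zero, following the usual cvt-versus-cvtt split in the x86
+   intrinsics.  */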
+
+/* Intrinsics vcvtph2udq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2udq128_mask (__A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2udq128_mask (__C, ( __v4si) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2udq128_mask (__B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2udq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2udq256_mask (__C, ( __v8si) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2udq256_mask (__B,
+ (__v8si) _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2dq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2dq128_mask (__A,
+ (__v4si) _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)__builtin_ia32_vcvttph2dq128_mask (__C,
+ ( __v4si) __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2dq128_mask (__B,
+ (__v4si) _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2dq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2dq256_mask (__C,
+ ( __v8si) __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2dq256_mask (__B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2udq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu32 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2udq128_mask (__A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2udq128_mask (__C,
+ ( __v4si) __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2udq128_mask (__B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu32 (__m128h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2udq256_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2udq256_mask (__C,
+ ( __v8si) __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2udq256_mask (__B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtdq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_ph (__mmask8 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
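+
+/* Note the asymmetric types in the 256-bit forms above: eight 32-bit
+   integers narrow to eight 16-bit halves, so _mm256_cvtepi32_ph takes
+   a __m256i yet returns a __m128h.  The same shape recurs in the
+   narrowing conversions that follow (udq, qq, uqq, pd sources).  */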
+
+/* Intrinsics vcvtudq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __C,
+ __A,
+ __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_ph (__mmask8 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtph2qq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi64 (__m128h __A)
+{
+ return
+ __builtin_ia32_vcvtph2qq128_mask (__A,
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2qq128_mask (__C, __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2qq128_mask (__B,
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2qq256_mask (__A,
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2qq256_mask (__C, __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2qq256_mask (__B,
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtph2uqq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2uqq128_mask (__A,
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2uqq128_mask (__C, __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2uqq128_mask (__B,
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2uqq256_mask (__A,
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2uqq256_mask (__C, __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2uqq256_mask (__B,
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2qq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2qq128_mask (__A,
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2qq128_mask (__C,
+ __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvttph2qq128_mask (__B,
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2qq256_mask (__A,
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2qq256_mask (__C,
+ __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvttph2qq256_mask (__B,
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2uqq. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2uqq128_mask (__A,
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2uqq128_mask (__C,
+ __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvttph2uqq128_mask (__B,
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu64 (__m128h __A)
+{
+ return __builtin_ia32_vcvttph2uqq256_mask (__A,
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvttph2uqq256_mask (__C,
+ __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvttph2uqq256_mask (__B,
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtqq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_ph (__mmask8 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtuqq2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu64_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu64_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu64_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu64_ph (__mmask8 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtph2w. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi16 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2w128_mask (__A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2w128_mask (__C, ( __v8hi) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi16 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2w128_mask (__B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi16 (__m256h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2w256_mask (__A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2w256_mask (__C, ( __v16hi) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi16 (__mmask16 __A, __m256h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2w256_mask (__B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtph2uw. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu16 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2uw128_mask (__A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2uw128_mask (__C, ( __v8hi) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu16 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvtph2uw128_mask (__B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu16 (__m256h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2uw256_mask (__A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2uw256_mask (__C, ( __v16hi) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu16 (__mmask16 __A, __m256h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvtph2uw256_mask (__B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2w. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi16 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2w128_mask (__A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2w128_mask (__C,
+ ( __v8hi) __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi16 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2w128_mask (__B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi16 (__m256h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2w256_mask (__A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2w256_mask (__C,
+ ( __v16hi) __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi16 (__mmask16 __A, __m256h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2w256_mask (__B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvttph2uw. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu16 (__m128h __A)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2uw128_mask (__A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2uw128_mask (__C,
+ ( __v8hi) __A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu16 (__mmask8 __A, __m128h __B)
+{
+ return (__m128i)
+ __builtin_ia32_vcvttph2uw128_mask (__B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu16 (__m256h __A)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2uw256_mask (__A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2uw256_mask (__C,
+ ( __v16hi) __A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu16 (__mmask16 __A, __m256h __B)
+{
+ return (__m256i)
+ __builtin_ia32_vcvttph2uw256_mask (__B,
+ (__v16hi) _mm256_setzero_si256 (),
+ __A);
+}
+
+/* Intrinsics vcvtw2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __C,
+ __A,
+ __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_ph (__m256h __A, __mmask16 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __C,
+ __A,
+ __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_ph (__mmask16 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __B,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtuw2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_ph (__m128i __A)
+{
+ return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu16_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+ return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu16_ph (__mmask8 __A, __m128i __B)
+{
+ return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_ph (__m256i __A)
+{
+ return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __A,
+ _mm256_setzero_ph (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu16_ph (__m256h __A, __mmask16 __B, __m256i __C)
+{
+ return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu16_ph (__mmask16 __A, __m256i __B)
+{
+ return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __B,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtph2pd. */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_pd (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2pd128_mask (__A,
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_pd (__m128d __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2pd128_mask (__C, __A, __B);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2pd128_mask (__B, _mm_setzero_pd (), __A);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_pd (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2pd256_mask (__A,
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_pd (__m256d __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2pd256_mask (__C, __A, __B);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2pd256_mask (__B,
+ _mm256_setzero_pd (),
+ __A);
+}
+
+/* Intrinsics vcvtph2ps. */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtxph_ps (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2psx128_mask (__A,
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtxph_ps (__m128 __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2psx128_mask (__C, __A, __B);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2psx128_mask (__B, _mm_setzero_ps (), __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtxph_ps (__m128h __A)
+{
+ return __builtin_ia32_vcvtph2psx256_mask (__A,
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtxph_ps (__m256 __A, __mmask8 __B, __m128h __C)
+{
+ return __builtin_ia32_vcvtph2psx256_mask (__C, __A, __B);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
+{
+ return __builtin_ia32_vcvtph2psx256_mask (__B,
+ _mm256_setzero_ps (),
+ __A);
+}
+
+/* Intrinsics vcvtxps2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtxps_ph (__m128 __A)
+{
+ return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m128 __C)
+{
+ return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtxps_ph (__mmask8 __A, __m128 __B)
+{
+ return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtxps_ph (__m256 __A)
+{
+ return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m256 __C)
+{
+ return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtxps_ph (__mmask8 __A, __m256 __B)
+{
+ return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vcvtpd2ph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ph (__m128d __A)
+{
+ return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m128d __C)
+{
+ return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_ph (__mmask8 __A, __m128d __B)
+{
+ return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_ph (__m256d __A)
+{
+ return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __A,
+ _mm_setzero_ph (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m256d __C)
+{
+ return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_ph (__mmask8 __A, __m256d __B)
+{
+ return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __B,
+ _mm_setzero_ph (),
+ __A);
+}
+
+/* Intrinsics vfmaddsub[132,213,231]ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h)__builtin_ia32_vfmaddsubph256_mask ((__v16hf)__A,
+ (__v16hf)__B,
+ (__v16hf)__C,
+ (__mmask16)-1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_ph (__m256h __A, __mmask16 __U, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmaddsubph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmaddsub_ph (__m256h __A, __m256h __B, __m256h __C,
+ __mmask16 __U)
+{
+ return (__m256h) __builtin_ia32_vfmaddsubph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmaddsub_ph (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmaddsubph256_maskz ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h)__builtin_ia32_vfmaddsubph128_mask ((__v8hf)__A,
+ (__v8hf)__B,
+ (__v8hf)__C,
+ (__mmask8)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmaddsub_ph (__m128h __A, __mmask8 __U, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmaddsubph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmaddsub_ph (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfmaddsubph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmaddsub_ph (__mmask8 __U, __m128h __A, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmaddsubph128_maskz ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
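+
+/* Lane semantics of the fmaddsub family, as a sketch: even-indexed
+   _Float16 lanes compute __A*__B - __C, odd-indexed lanes compute
+   __A*__B + __C, i.e. roughly
+
+     for (i = 0; i < n; i++)
+       __R[i] = (i & 1) ? __A[i] * __B[i] + __C[i]
+                        : __A[i] * __B[i] - __C[i];
+
+   The fmsubadd family below swaps the two cases.  */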
+
+/* Intrinsics vfmsubadd[132,213,231]ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmsubaddph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsubadd_ph (__m256h __A, __mmask16 __U, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmsubaddph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsubadd_ph (__m256h __A, __m256h __B, __m256h __C,
+ __mmask16 __U)
+{
+ return (__m256h) __builtin_ia32_vfmsubaddph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsubadd_ph (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmsubaddph256_maskz ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmsubaddph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsubadd_ph (__m128h __A, __mmask8 __U, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmsubaddph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsubadd_ph (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfmsubaddph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsubadd_ph (__mmask8 __U, __m128h __A, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmsubaddph128_maskz ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+/* Intrinsics vfmadd[132,213,231]ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmaddph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_ph (__m256h __A, __mmask16 __U, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmaddph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_ph (__m256h __A, __m256h __B, __m256h __C,
+ __mmask16 __U)
+{
+ return (__m256h) __builtin_ia32_vfmaddph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_ph (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmaddph256_maskz ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmaddph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_ph (__m128h __A, __mmask8 __U, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmaddph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_ph (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfmaddph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_ph (__mmask8 __U, __m128h __A, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmaddph128_maskz ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
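+
+/* The three masked FMA variants differ only in the inactive lanes:
+   _mask_ keeps the lane of the first (accumulator) operand, _mask3_
+   keeps the lane of the third operand, and _maskz_ zeroes it.  A sketch
+   with an assumed mask:
+
+     __m128h __r = _mm_maskz_fmadd_ph (0x0F, __a, __b, __c);
+     // lanes 0..3 = __a*__b + __c, lanes 4..7 = 0.0
+*/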
+
+/* Intrinsics vfnmadd[132,213,231]ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfnmaddph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_ph (__m256h __A, __mmask16 __U, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfnmaddph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_ph (__m256h __A, __m256h __B, __m256h __C,
+ __mmask16 __U)
+{
+ return (__m256h) __builtin_ia32_vfnmaddph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmadd_ph (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfnmaddph256_maskz ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfnmaddph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_ph (__m128h __A, __mmask8 __U, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfnmaddph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_ph (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfnmaddph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_ph (__mmask8 __U, __m128h __A, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfnmaddph128_maskz ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+/* Intrinsics vfmsub[132,213,231]ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmsubph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_ph (__m256h __A, __mmask16 __U, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmsubph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_ph (__m256h __A, __m256h __B, __m256h __C,
+ __mmask16 __U)
+{
+ return (__m256h) __builtin_ia32_vfmsubph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_ph (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmsubph256_maskz ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmsubph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_ph (__m128h __A, __mmask8 __U, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmsubph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_ph (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfmsubph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_ph (__mmask8 __U, __m128h __A, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmsubph128_maskz ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+/* Intrinsics vfnmsub[132,213,231]ph. */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfnmsubph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_ph (__m256h __A, __mmask16 __U, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfnmsubph256_mask ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_ph (__m256h __A, __m256h __B, __m256h __C,
+ __mmask16 __U)
+{
+ return (__m256h) __builtin_ia32_vfnmsubph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_ph (__mmask16 __U, __m256h __A, __m256h __B,
+ __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfnmsubph256_maskz ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfnmsubph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_ph (__m128h __A, __mmask8 __U, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfnmsubph128_mask ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_ph (__m128h __A, __m128h __B, __m128h __C,
+ __mmask8 __U)
+{
+ return (__m128h) __builtin_ia32_vfnmsubph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_ph (__mmask8 __U, __m128h __A, __m128h __B,
+ __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfnmsubph128_maskz ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C,
+ (__mmask8)
+ __U);
+}
+
+/* Intrinsics vf[,c]maddcph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_pch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmaddcph128 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfmaddcph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_vfmaddcph128_maskz ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_pch (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmaddcph256 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h)
+ __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A,
+ (__v16hf) __C,
+ (__v16hf) __D, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D)
+{
+ return (__m256h)
+ __builtin_ia32_vfmaddcph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D)
+{
+ return (__m256h)__builtin_ia32_vfmaddcph256_maskz ((__v16hf) __B,
+ (__v16hf) __C,
+ (__v16hf) __D, __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfcmaddcph128 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A,
+ (__v8hf) __C,
+ (__v8hf) __D, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+ return (__m128h)
+ __builtin_ia32_vfcmaddcph128_mask3 ((__v8hf) __A,
+ (__v8hf) __B,
+ (__v8hf) __C, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+ return (__m128h)__builtin_ia32_vfcmaddcph128_maskz ((__v8hf) __B,
+ (__v8hf) __C,
+ (__v8hf) __D, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfcmaddcph256 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h)
+ __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A,
+ (__v16hf) __C,
+ (__v16hf) __D, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D)
+{
+ return (__m256h)
+ __builtin_ia32_vfcmaddcph256_mask3 ((__v16hf) __A,
+ (__v16hf) __B,
+ (__v16hf) __C, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_vfcmaddcph256_maskz ((__v16hf) __B,
+ (__v16hf) __C,
+ (__v16hf) __D, __A);
+}
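+
+/* The _pch intrinsics treat each pair of _Float16 lanes as one complex
+   number (real part in the even lane, imaginary part in the odd lane),
+   so a __m128h carries 4 complex values and a __m256h carries 8.
+   fmadd_pch computes the complex product __A*__B plus __C; fcmadd_pch
+   does the same but, per Intel's documentation, uses the complex
+   conjugate of the second operand.  One mask bit governs one complex
+   number, i.e. a 32-bit lane pair.  */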
+
+/* Intrinsics vf[,c]mulcph. */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_pch (__m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfmulcph128 ((__v8hf) __A, (__v8hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_pch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmul_pch (__m256h __A, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_vfmulcph256 ((__v16hf) __A,
+ (__v16hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __C,
+ (__v16hf) __D,
+ (__v16hf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __B,
+ (__v16hf) __C,
+ _mm256_setzero_ph (),
+ __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_pch (__m128h __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vfcmulcph128 ((__v8hf) __A,
+ (__v8hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+ return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __C,
+ (__v8hf) __D,
+ (__v8hf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_pch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+ return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __B,
+ (__v8hf) __C,
+ _mm_setzero_ph (),
+ __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmul_pch (__m256h __A, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_vfcmulcph256 ((__v16hf) __A, (__v16hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+ return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __C,
+ (__v16hf) __D,
+ (__v16hf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
+{
+ return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __B,
+ (__v16hf) __C,
+ _mm256_setzero_ph (),
+ __A);
+}
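+
+/* Usage sketch for the complex multiplies, under the same lane pairing
+   as above and assuming the _mm_setr_ph constructor is available:
+
+     __m128h __x = _mm_setr_ph (1.0, 2.0, 0, 0, 0, 0, 0, 0); // 1+2i
+     __m128h __y = _mm_setr_ph (3.0, 4.0, 0, 0, 0, 0, 0, 0); // 3+4i
+     __m128h __p = _mm_fmul_pch (__x, __y);  // (1+2i)*(3+4i) = -5+10i
+     __m128h __q = _mm_fcmul_pch (__x, __y); // (1+2i)*conj(3+4i) = 11+2i
+*/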
+
+#define _MM256_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
+ __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
+ __m128h __T3 = (__T1 op __T2); \
+ __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
+ (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
+ __m128h __T5 = (__T3) op (__T4); \
+ __m128h __T6 = (__m128h) __builtin_shuffle (__T5, \
+ (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \
+ __m128h __T7 = __T5 op __T6; \
+ return __T7[0] op __T7[1]
+
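+/* The macro above is a log2-time horizontal reduction: extract the two
+   128-bit halves and combine them, then fold the 128-bit survivor in
+   half twice with __builtin_shuffle until two lanes remain, which are
+   combined as scalars.  */
+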
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_add_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_mul_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (*);
+}
+
+#undef _MM256_REDUCE_OP
+#define _MM256_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \
+ __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \
+ __m128h __T3 = _mm_##op (__T1, __T2); \
+ __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \
+ (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
+ __m128h __T5 = _mm_##op (__T3, __T4); \
+ __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); \
+ __m128h __T7 = _mm_##op (__T5, __T6); \
+ __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); \
+ __m128h __T9 = _mm_##op (__T7, __T8); \
+ return __T9[0]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_min_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (min_ph);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_max_ph (__m256h __A)
+{
+ _MM256_REDUCE_OP (max_ph);
+}
+
+#define _MM_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) __builtin_shuffle (__A, \
+ (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \
+ __m128h __T2 = (__A) op (__T1); \
+ __m128h __T3 = (__m128h) __builtin_shuffle (__T2, \
+ (__v8hi){ 2, 3, 0, 1, 4, 5, 6, 7 }); \
+ __m128h __T4 = __T2 op __T3; \
+ return __T4[0] op __T4[1]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_add_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_mul_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (*);
+}
+
+#undef _MM_REDUCE_OP
+#define _MM_REDUCE_OP(op) \
+ __m128h __T1 = (__m128h) __builtin_shuffle (__A, \
+ (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \
+ __m128h __T2 = _mm_##op (__A, __T1); \
+ __m128h __T3 = (__m128h) __builtin_shuffle (__T2, (__v8hi){ 4, 5 }); \
+ __m128h __T4 = _mm_##op (__T2, __T3); \
+ __m128h __T5 = (__m128h) __builtin_shuffle (__T4, (__v8hi){ 1, 0 }); \
+ __m128h __T6 = _mm_##op (__T4, __T5); \
+ return __T6[0]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_min_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (min_ph);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_max_ph (__m128h __A)
+{
+ _MM_REDUCE_OP (max_ph);
+}
+
+#undef _MM256_REDUCE_OP
+#undef _MM_REDUCE_OP
+
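+/* The FP16 blend and permute forms below are pure 16-bit lane movement,
+   so they are implemented on top of the existing 16-bit integer
+   builtins (vmovdqu16, vpermi2w, vpermw) with the vectors reinterpreted
+   as __v16hi/__v8hi.  */
+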
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ph (__mmask16 __U, __m256h __A, __m256h __W)
+{
+ return (__m256h) __builtin_ia32_movdquhi256_mask ((__v16hi) __W,
+ (__v16hi) __A,
+ (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_ph (__m256h __A, __m256i __I, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+ (__v16hi) __I,
+ (__v16hi) __B,
+ (__mmask16)-1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_ph (__m256i __A, __m256h __B)
+{
+ return (__m256h) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+ (__v16hi) __A,
+ (__v16hi)
+ (_mm256_setzero_ph ()),
+ (__mmask16)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ph (__mmask8 __U, __m128h __A, __m128h __W)
+{
+ return (__m128h) __builtin_ia32_movdquhi128_mask ((__v8hi) __W,
+ (__v8hi) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_ph (__m128h __A, __m128i __I, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+ (__v8hi) __I,
+ (__v8hi) __B,
+ (__mmask8)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_ph (__m128i __A, __m128h __B)
+{
+ return (__m128h) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+ (__v8hi) __A,
+ (__v8hi)
+ (_mm_setzero_ph ()),
+ (__mmask8)-1);
+}
+
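+/* set1_pch broadcasts one _Float16 _Complex value.  Such a value
+   occupies 32 bits (real part in the low half), so the implementations
+   below type-pun it to a float through a union and reuse the 32-bit
+   broadcast _mm*_set1_ps.  */
+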
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pch (_Float16 _Complex __A)
+{
+ union
+ {
+ _Float16 _Complex __a;
+ float __b;
+ } __u = { .__a = __A };
+
+ return (__m256h) _mm256_set1_ps (__u.__b);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pch (_Float16 _Complex __A)
+{
+ union
+ {
+ _Float16 _Complex __a;
+ float __b;
+ } __u = { .__a = __A };
+
+ return (__m128h) _mm_set1_ps (__u.__b);
+}
+
+/* The intrinsics below are aliases for the f[,c]mul_pch intrinsics above.  */
+#define _mm_mul_pch(A, B) _mm_fmul_pch ((A), (B))
+#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch ((W), (U), (A), (B))
+#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch ((U), (A), (B))
+#define _mm256_mul_pch(A, B) _mm256_fmul_pch ((A), (B))
+#define _mm256_mask_mul_pch(W, U, A, B) \
+ _mm256_mask_fmul_pch ((W), (U), (A), (B))
+#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch ((U), (A), (B))
+
+#define _mm_cmul_pch(A, B) _mm_fcmul_pch ((A), (B))
+#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch ((W), (U), (A), (B))
+#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch ((U), (A), (B))
+#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch ((A), (B))
+#define _mm256_mask_cmul_pch(W, U, A, B) \
+ _mm256_mask_fcmul_pch ((W), (U), (A), (B))
+#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch((U), (A), (B))
+
+#ifdef __DISABLE_AVX512FP16VL__
+#undef __DISABLE_AVX512FP16VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512FP16VL__ */
+
+#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512IFMAINTRIN_H_INCLUDED
+#define _AVX512IFMAINTRIN_H_INCLUDED
+
+#ifndef __AVX512IFMA__
+#pragma GCC push_options
+#pragma GCC target("avx512ifma")
+#define __DISABLE_AVX512IFMA__
+#endif /* __AVX512IFMA__ */
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di) __Z,
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di) __Z,
+ (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
+ (__v8di) __X,
+ (__v8di) __Y,
+ (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+ __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
+ (__v8di) __X,
+ (__v8di) __Y,
+ (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di) __Z,
+ (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+ return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
+ (__v8di) __Y,
+ (__v8di) __Z,
+ (__mmask8) __M);
+}
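+
+/* Semantics sketch: each 64-bit lane of the two multiplicand operands
+   is read as an unsigned 52-bit integer (the upper 12 bits are ignored)
+   and the pair is multiplied into a 104-bit product; madd52lo adds the
+   low 52 bits of that product to the 64-bit accumulator lane, madd52hi
+   the high 52 bits.  This is the usual building block for big-integer
+   arithmetic on 52-bit limbs.  */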
+
+#ifdef __DISABLE_AVX512IFMA__
+#undef __DISABLE_AVX512IFMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512IFMA__ */
+
+#endif /* _AVX512IFMAINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512IFMAVLINTRIN_H_INCLUDED
+#define _AVX512IFMAVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512IFMA__)
+#pragma GCC push_options
+#pragma GCC target("avx512ifma,avx512vl")
+#define __DISABLE_AVX512IFMAVL__
+#endif /* __AVX512IFMAVL__ */
+
+#define _mm_madd52lo_epu64(A, B, C) \
+ ((__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) (A), \
+ (__v2di) (B), \
+ (__v2di) (C)))
+
+#define _mm_madd52hi_epu64(A, B, C) \
+ ((__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) (A), \
+ (__v2di) (B), \
+ (__v2di) (C)))
+
+#define _mm256_madd52lo_epu64(A, B, C) \
+ ((__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) (A), \
+ (__v4di) (B), \
+ (__v4di) (C)))
+
+#define _mm256_madd52hi_epu64(A, B, C) \
+ ((__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) (A), \
+ (__v4di) (B), \
+ (__v4di) (C)))
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
+ (__v2di) __X,
+ (__v2di) __Y,
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
+ (__v2di) __X,
+ (__v2di) __Y,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
+ (__v4di) __X,
+ (__v4di) __Y,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
+ (__v4di) __X,
+ (__v4di) __Y,
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di) __Z,
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di) __Z,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di) __Z,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di) __Z,
+ (__mmask8) __M);
+}
+
+#ifdef __DISABLE_AVX512IFMAVL__
+#undef __DISABLE_AVX512IFMAVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512IFMAVL__ */
+
+#endif /* _AVX512IFMAVLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512PFINTRIN_H_INCLUDED
+#define _AVX512PFINTRIN_H_INCLUDED
+
+#ifndef __AVX512PF__
+#pragma GCC push_options
+#pragma GCC target("avx512pf")
+#define __DISABLE_AVX512PF__
+#endif /* __AVX512PF__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char __mmask8;
+typedef unsigned short __mmask16;
+
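+/* The scale and hint arguments of these prefetches must reach the
+   builtins as compile-time constants.  With optimization enabled the
+   inline functions below can propagate them; without it, the macro
+   forms in the #else branch substitute the literals directly.  */
+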
+#ifdef __OPTIMIZE__
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr,
+ int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr,
+ int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask,
+ void const *__addr, int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfdpd (__mask, (__v8si) __index, __addr, __scale,
+ __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32gather_ps (__m512i __index, __mmask16 __mask,
+ void const *__addr, int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfdps (__mask, (__v16si) __index, __addr, __scale,
+ __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr,
+ int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr,
+ int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask,
+ void const *__addr, int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfqpd (__mask, (__v8di) __index, __addr, __scale,
+ __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64gather_ps (__m512i __index, __mmask8 __mask,
+ void const *__addr, int __scale, int __hint)
+{
+ __builtin_ia32_gatherpfqps (__mask, (__v8di) __index, __addr, __scale,
+ __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32scatter_pd (void *__addr, __m256i __index, int __scale,
+ int __hint)
+{
+ __builtin_ia32_scatterpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32scatter_ps (void *__addr, __m512i __index, int __scale,
+ int __hint)
+{
+ __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32scatter_pd (void *__addr, __mmask8 __mask,
+ __m256i __index, int __scale, int __hint)
+{
+ __builtin_ia32_scatterpfdpd (__mask, (__v8si) __index, __addr, __scale,
+ __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32scatter_ps (void *__addr, __mmask16 __mask,
+ __m512i __index, int __scale, int __hint)
+{
+ __builtin_ia32_scatterpfdps (__mask, (__v16si) __index, __addr, __scale,
+ __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64scatter_pd (void *__addr, __m512i __index, int __scale,
+ int __hint)
+{
+ __builtin_ia32_scatterpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64scatter_ps (void *__addr, __m512i __index, int __scale,
+ int __hint)
+{
+ __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
+ __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64scatter_pd (void *__addr, __mmask8 __mask,
+ __m512i __index, int __scale, int __hint)
+{
+ __builtin_ia32_scatterpfqpd (__mask, (__v8di) __index, __addr, __scale,
+ __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64scatter_ps (void *__addr, __mmask8 __mask,
+ __m512i __index, int __scale, int __hint)
+{
+ __builtin_ia32_scatterpfqps (__mask, (__v8di) __index, __addr, __scale,
+ __hint);
+}
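+
+/* Usage sketch with hypothetical __index/__addr, assuming the
+   _MM_HINT_* constants from <xmmintrin.h> are visible via
+   <immintrin.h>: prefetch eight doubles located at
+   __addr + __index[i] * 8 into the L1 cache:
+
+     _mm512_prefetch_i64gather_pd (__index, __addr, 8, _MM_HINT_T0);
+*/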
+
+#else
+#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX), \
+ (void const *) (ADDR), (int) (SCALE), \
+ (int) (HINT))
+
+#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), \
+ (void const *) (ADDR), (int) (SCALE), \
+ (int) (HINT))
+
+#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), \
+ (void const *) (ADDR), (int) (SCALE), \
+ (int) (HINT))
+
+#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfdps ((__mmask16) (MASK), (__v16si)(__m512i) (INDEX),\
+ (void const *) (ADDR), (int) (SCALE), \
+ (int) (HINT))
+
+#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \
+ (void const *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \
+ (void const *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
+ (void const *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \
+ __builtin_ia32_gatherpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
+ (void const *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX), \
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX),\
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), \
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfdps ((__mmask16) (MASK), \
+ (__v16si)(__m512i) (INDEX), \
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \
+ __builtin_ia32_scatterpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
+ (void *) (ADDR), (int) (SCALE), (int) (HINT))
+#endif
+
+#ifdef __DISABLE_AVX512PF__
+#undef __DISABLE_AVX512PF__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512PF__ */
+
+#endif /* _AVX512PFINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VBMI2INTRIN_H_INCLUDED
+#define __AVX512VBMI2INTRIN_H_INCLUDED
+
+#if !defined(__AVX512VBMI2__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2")
+#define __DISABLE_AVX512VBMI2__
+#endif /* __AVX512VBMI2__ */
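+
+/* The shldi/shrdi intrinsics below are per-lane funnel shifts: shldi
+   concatenates each lane of __A (high part) with the matching lane of
+   __B (low part), shifts the double-width value left by the immediate
+   and keeps the high half; shrdi concatenates __B (high) with __A
+   (low), shifts right and keeps the low half.  The shift count must be
+   an immediate, hence the __OPTIMIZE__ split between inline functions
+   and macros.  */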
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdi_epi16 (__m512i __A, __m512i __B, int __C)
+{
+ return (__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)__A, (__v32hi) __B,
+ __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdi_epi32 (__m512i __A, __m512i __B, int __C)
+{
+ return (__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)__A, (__v16si) __B,
+ __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D,
+ int __E)
+{
+ return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__C,
+ (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D)
+{
+ return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__B,
+ (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdi_epi64 (__m512i __A, __m512i __B, int __C)
+{
+ return (__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)__A, (__v8di) __B, __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
+ int __E)
+{
+ return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__C, (__v8di) __D,
+ __E, (__v8di) __A, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D)
+{
+ return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__B, (__v8di) __C,
+ __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldi_epi16 (__m512i __A, __m512i __B, int __C)
+{
+ return (__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)__A, (__v32hi) __B,
+ __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldi_epi32 (__m512i __A, __m512i __B, int __C)
+{
+ return (__m512i) __builtin_ia32_vpshld_v16si ((__v16si)__A, (__v16si) __B,
+ __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D,
+ int __E)
+{
+ return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__C,
+ (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D)
+{
+ return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__B,
+ (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldi_epi64 (__m512i __A, __m512i __B, int __C)
+{
+ return (__m512i) __builtin_ia32_vpshld_v8di ((__v8di)__A, (__v8di) __B, __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
+ int __E)
+{
+ return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__C, (__v8di) __D,
+ __E, (__v8di) __A, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D)
+{
+ return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__B, (__v8di) __C,
+ __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A);
+}
+#else
+#define _mm512_shrdi_epi16(A, B, C) \
+ ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), \
+ (__v32hi)(__m512i)(B),(int)(C)))
+#define _mm512_shrdi_epi32(A, B, C) \
+ ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shrdi_epi32(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), \
+ (__v16si)(__m512i)(D), \
+ (int)(E), \
+ (__v16si)(__m512i)(A), \
+ (__mmask16)(B)))
+#define _mm512_maskz_shrdi_epi32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(C),(int)(D), \
+ (__v16si)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask16)(A)))
+#define _mm512_shrdi_epi64(A, B, C) \
+ ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shrdi_epi64(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), \
+ (__v8di)(__m512i)(D), (int)(E), \
+ (__v8di)(__m512i)(A), \
+ (__mmask8)(B)))
+#define _mm512_maskz_shrdi_epi64(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(C),(int)(D), \
+ (__v8di)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask8)(A)))
+#define _mm512_shldi_epi16(A, B, C) \
+ ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), \
+ (__v32hi)(__m512i)(B),(int)(C)))
+#define _mm512_shldi_epi32(A, B, C) \
+ ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), \
+ (__v16si)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shldi_epi32(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), \
+ (__v16si)(__m512i)(D), \
+ (int)(E), \
+ (__v16si)(__m512i)(A), \
+ (__mmask16)(B)))
+#define _mm512_maskz_shldi_epi32(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), \
+ (__v16si)(__m512i)(C),(int)(D), \
+ (__v16si)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask16)(A)))
+#define _mm512_shldi_epi64(A, B, C) \
+ ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), (int)(C)))
+#define _mm512_mask_shldi_epi64(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), \
+ (__v8di)(__m512i)(D), (int)(E), \
+ (__v8di)(__m512i)(A), \
+ (__mmask8)(B)))
+#define _mm512_maskz_shldi_epi64(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), \
+ (__v8di)(__m512i)(C),(int)(D), \
+ (__v8di)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask8)(A)))
+#endif
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdv_epi16 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpshrdv_v32hi ((__v32hi)__A, (__v32hi) __B,
+ (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdv_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpshrdv_v16si ((__v16si)__A, (__v16si) __B,
+ (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshrdv_v16si_mask ((__v16si)__A,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz ((__v16si)__B,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdv_epi64 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpshrdv_v8di ((__v8di)__A, (__v8di) __B,
+ (__v8di) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshrdv_v8di_mask ((__v8di)__A, (__v8di) __C,
+ (__v8di) __D, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz ((__v8di)__B, (__v8di) __C,
+ (__v8di) __D, (__mmask8)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldv_epi16 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpshldv_v32hi ((__v32hi)__A, (__v32hi) __B,
+ (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldv_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpshldv_v16si ((__v16si)__A, (__v16si) __B,
+ (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshldv_v16si_mask ((__v16si)__A,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshldv_v16si_maskz ((__v16si)__B,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldv_epi64 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpshldv_v8di ((__v8di)__A, (__v8di) __B,
+ (__v8di) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshldv_v8di_mask ((__v8di)__A, (__v8di) __C,
+ (__v8di) __D, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshldv_v8di_maskz ((__v8di)__B, (__v8di) __C,
+ (__v8di) __D, (__mmask8)__A);
+}
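+
+/* The shldv/shrdv forms above are the variable-count siblings of the
+   shldi/shrdi immediates: each lane takes its count from the matching
+   lane of the last vector operand instead of an imm8.  Sketch for one
+   32-bit lane with a nonzero count c = C[i] & 31:
+     _mm512_shldv_epi32 (A, B, C)[i] == (A[i] << c) | (B[i] >> (32 - c))
+   The mask_ variants merge inactive lanes from the first operand; the
+   maskz_ variants zero them.  */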
+
+#ifdef __DISABLE_AVX512VBMI2__
+#undef __DISABLE_AVX512VBMI2__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2__ */
+
+#if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512bw")
+#define __DISABLE_AVX512VBMI2BW__
+#endif /* __AVX512VBMI2BW__ */
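+
+/* The byte and word forms below require avx512bw on top of avx512vbmi2
+   because their writemask types (__mmask32/__mmask64) only exist with
+   AVX-512BW, hence the combined target pragma above.  */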
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__C,
+ (__v64qi)__A, (__mmask64)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi8 (__mmask64 __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__B,
+ (__v64qi)_mm512_setzero_si512 (), (__mmask64)__A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi8 (void * __A, __mmask64 __B, __m512i __C)
+{
+ __builtin_ia32_compressstoreuqi512_mask ((__v64qi *) __A, (__v64qi) __C,
+ (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi16 (__m512i __A, __mmask32 __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__C,
+ (__v32hi)__A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi16 (__mmask32 __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__B,
+ (__v32hi)_mm512_setzero_si512 (), (__mmask32)__A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi16 (void * __A, __mmask32 __B, __m512i __C)
+{
+ __builtin_ia32_compressstoreuhi512_mask ((__v32hi *) __A, (__v32hi) __C,
+ (__mmask32) __B);
+}
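+
+/* Sketch of the compress family, assuming the usual VPCOMPRESS behaviour:
+   elements whose mask bit is set are packed contiguously toward the low
+   end; the mask_ form fills the remaining lanes from the first operand,
+   the maskz_ form zeroes them, and compressstoreu stores only the packed
+   prefix to (possibly unaligned) memory.  */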
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __C,
+ (__v64qi) __A,
+ (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi8 (__mmask64 __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_expandqi512_maskz ((__v64qi) __B,
+ (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi8 (__m512i __A, __mmask64 __B, const void * __C)
+{
+ return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *) __C,
+ (__v64qi) __A, (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi8 (__mmask64 __A, const void * __B)
+{
+ return (__m512i) __builtin_ia32_expandloadqi512_maskz ((const __v64qi *) __B,
+ (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi16 (__m512i __A, __mmask32 __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __C,
+ (__v32hi) __A,
+ (__mmask32) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi16 (__mmask32 __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_expandhi512_maskz ((__v32hi) __B,
+ (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi16 (__m512i __A, __mmask32 __B, const void * __C)
+{
+ return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *) __C,
+ (__v32hi) __A, (__mmask32) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi16 (__mmask32 __A, const void * __B)
+{
+ return (__m512i) __builtin_ia32_expandloadhi512_maskz ((const __v32hi *) __B,
+ (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A);
+}
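+
+/* The expand family is the inverse: consecutive source elements (for
+   expandloadu, consecutive elements read from memory) are deposited in
+   order into the lanes whose mask bit is set; inactive lanes come from
+   the first operand (mask_) or are zeroed (maskz_).  */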
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D,
+ int __E)
+{
+ return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__C,
+ (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D)
+{
+ return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__B,
+ (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D,
+ int __E)
+{
+ return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__C,
+ (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D)
+{
+ return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__B,
+ (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A);
+}
+
+#else
+#define _mm512_mask_shrdi_epi16(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), \
+ (__v32hi)(__m512i)(D), \
+ (int)(E), \
+ (__v32hi)(__m512i)(A), \
+ (__mmask32)(B)))
+#define _mm512_maskz_shrdi_epi16(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), \
+ (__v32hi)(__m512i)(C),(int)(D), \
+ (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask32)(A)))
+#define _mm512_mask_shldi_epi16(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), \
+ (__v32hi)(__m512i)(D), \
+ (int)(E), \
+ (__v32hi)(__m512i)(A), \
+ (__mmask32)(B)))
+#define _mm512_maskz_shldi_epi16(A, B, C, D) \
+ ((__m512i) \
+ __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), \
+ (__v32hi)(__m512i)(C),(int)(D), \
+ (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+ (__mmask32)(A)))
+#endif
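+
+/* The split above follows the usual GCC intrinsics pattern: with
+   optimisation enabled the always_inline wrappers fold away and the
+   shift count reaches the builtin as a constant, while at -O0 the macro
+   forms are needed to keep the imm8 operand a compile-time constant.  */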
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask ((__v32hi)__A,
+ (__v32hi) __C, (__v32hi) __D, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz ((__v32hi)__B,
+ (__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshldv_v32hi_mask ((__v32hi)__A,
+ (__v32hi) __C, (__v32hi) __D, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz ((__v32hi)__B,
+ (__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2BW__
+#undef __DISABLE_AVX512VBMI2BW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2BW__ */
+
+#endif /* __AVX512VBMI2INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmi2vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED
+#define _AVX512VBMI2VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512vl")
+#define __DISABLE_AVX512VBMI2VL__
+#endif /* __AVX512VBMI2VL__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi8 (__m128i __A, __mmask16 __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi)__C,
+ (__v16qi)__A, (__mmask16)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi8 (__mmask16 __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __B,
+ (__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi16 (void * __A, __mmask16 __B, __m256i __C)
+{
+ __builtin_ia32_compressstoreuhi256_mask ((__v16hi *) __A, (__v16hi) __C,
+ (__mmask16) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi16 (__m128i __A, __mmask8 __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi)__C, (__v8hi)__A,
+ (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi16 (__mmask8 __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __B,
+ (__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi16 (__m256i __A, __mmask16 __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi)__C,
+ (__v16hi)__A, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi16 (__mmask16 __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __B,
+ (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi8 (void * __A, __mmask16 __B, __m128i __C)
+{
+ __builtin_ia32_compressstoreuqi128_mask ((__v16qi *) __A, (__v16qi) __C,
+ (__mmask16) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi16 (void * __A, __mmask8 __B, __m128i __C)
+{
+ __builtin_ia32_compressstoreuhi128_mask ((__v8hi *) __A, (__v8hi) __C,
+ (__mmask8) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi8 (__m128i __A, __mmask16 __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __C,
+ (__v16qi) __A,
+ (__mmask16) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi8 (__mmask16 __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_expandqi128_maskz ((__v16qi) __B,
+ (__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi8 (__m128i __A, __mmask16 __B, const void * __C)
+{
+ return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *) __C,
+ (__v16qi) __A, (__mmask16) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi8 (__mmask16 __A, const void * __B)
+{
+ return (__m128i) __builtin_ia32_expandloadqi128_maskz ((const __v16qi *) __B,
+ (__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi16 (__m128i __A, __mmask8 __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __C,
+ (__v8hi) __A,
+ (__mmask8) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi16 (__mmask8 __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_expandhi128_maskz ((__v8hi) __B,
+ (__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi16 (__m128i __A, __mmask8 __B, const void * __C)
+{
+ return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *) __C,
+ (__v8hi) __A, (__mmask8) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi16 (__mmask8 __A, const void * __B)
+{
+ return (__m128i) __builtin_ia32_expandloadhi128_maskz ((const __v8hi *) __B,
+ (__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi16 (__m256i __A, __mmask16 __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __C,
+ (__v16hi) __A,
+ (__mmask16) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi16 (__mmask16 __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_expandhi256_maskz ((__v16hi) __B,
+ (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi16 (__m256i __A, __mmask16 __B, const void * __C)
+{
+ return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *) __C,
+ (__v16hi) __A, (__mmask16) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi16 (__mmask16 __A, const void * __B)
+{
+ return (__m256i) __builtin_ia32_expandloadhi256_maskz ((const __v16hi *) __B,
+ (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdi_epi16 (__m256i __A, __m256i __B, int __C)
+{
+ return (__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)__A, (__v16hi) __B,
+ __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D,
+ int __E)
+{
+ return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__C,
+ (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
+{
+ return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__B,
+ (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+ int __E)
+{
+ return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__C, (__v8si) __D,
+ __E, (__v8si) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+ return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__B, (__v8si) __C,
+ __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdi_epi32 (__m256i __A, __m256i __B, int __C)
+{
+ return (__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)__A, (__v8si) __B, __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+ int __E)
+{
+ return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__C, (__v4di) __D,
+ __E, (__v4di) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+ return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__B, (__v4di) __C,
+ __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdi_epi64 (__m256i __A, __m256i __B, int __C)
+{
+ return (__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)__A, (__v4di) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+ int __E)
+{
+ return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__C, (__v8hi) __D,
+ __E, (__v8hi) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+ return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
+ __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdi_epi16 (__m128i __A, __m128i __B, int __C)
+{
+ return (__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)__A, (__v8hi) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+ int __E)
+{
+ return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__C, (__v4si) __D,
+ __E, (__v4si) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+ return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__B, (__v4si) __C,
+ __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdi_epi32 (__m128i __A, __m128i __B, int __C)
+{
+ return (__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)__A, (__v4si) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+ int __E)
+{
+ return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__C, (__v2di) __D,
+ __E, (__v2di) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+ return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__B, (__v2di) __C,
+ __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdi_epi64 (__m128i __A, __m128i __B, int __C)
+{
+ return (__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)__A, (__v2di) __B, __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldi_epi16 (__m256i __A, __m256i __B, int __C)
+{
+ return (__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)__A, (__v16hi) __B,
+ __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D,
+ int __E)
+{
+ return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__C,
+ (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
+{
+ return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__B,
+ (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+ int __E)
+{
+ return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__C, (__v8si) __D,
+ __E, (__v8si) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+ return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__B, (__v8si) __C,
+ __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldi_epi32 (__m256i __A, __m256i __B, int __C)
+{
+ return (__m256i) __builtin_ia32_vpshld_v8si ((__v8si)__A, (__v8si) __B, __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+ int __E)
+{
+ return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__C, (__v4di) __D,
+ __E, (__v4di) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+ return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__B, (__v4di) __C,
+ __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldi_epi64 (__m256i __A, __m256i __B, int __C)
+{
+ return (__m256i) __builtin_ia32_vpshld_v4di ((__v4di)__A, (__v4di) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+ int __E)
+{
+ return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__C, (__v8hi) __D,
+ __E, (__v8hi) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+ return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
+ __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldi_epi16 (__m128i __A, __m128i __B, int __C)
+{
+ return (__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)__A, (__v8hi) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+ int __E)
+{
+ return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__C, (__v4si) __D,
+ __E, (__v4si) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+ return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__B, (__v4si) __C,
+ __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldi_epi32 (__m128i __A, __m128i __B, int __C)
+{
+ return (__m128i) __builtin_ia32_vpshld_v4si ((__v4si)__A, (__v4si) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+ int __E)
+{
+ return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__C, (__v2di) __D,
+ __E, (__v2di) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+ return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__B, (__v2di) __C,
+ __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldi_epi64 (__m128i __A, __m128i __B, int __C)
+{
+ return (__m128i) __builtin_ia32_vpshld_v2di ((__v2di)__A, (__v2di) __B, __C);
+}
+#else
+#define _mm256_shrdi_epi16(A, B, C) \
+ ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), \
+ (__v16hi)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi16(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), \
+ (__v16hi)(__m256i)(D), \
+ (int)(E), \
+ (__v16hi)(__m256i)(A), \
+ (__mmask16)(B)))
+#define _mm256_maskz_shrdi_epi16(A, B, C, D) \
+ ((__m256i) \
+ __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), \
+ (__v16hi)(__m256i)(C),(int)(D), \
+ (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask16)(A)))
+#define _mm256_shrdi_epi32(A, B, C) \
+ ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi32(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), \
+ (__v8si)(__m256i)(D), \
+ (int)(E), \
+ (__v8si)(__m256i)(A), \
+ (__mmask8)(B)))
+#define _mm256_maskz_shrdi_epi32(A, B, C, D) \
+ ((__m256i) \
+ __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), \
+ (__v8si)(__m256i)(C),(int)(D), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(A)))
+#define _mm256_shrdi_epi64(A, B, C) \
+ ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi64(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), \
+ (__v4di)(__m256i)(D), (int)(E), \
+ (__v4di)(__m256i)(A), \
+ (__mmask8)(B)))
+#define _mm256_maskz_shrdi_epi64(A, B, C, D) \
+ ((__m256i) \
+ __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), \
+ (__v4di)(__m256i)(C),(int)(D), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(A)))
+#define _mm_shrdi_epi16(A, B, C) \
+ ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi16(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), \
+ (__v8hi)(__m128i)(D), (int)(E), \
+ (__v8hi)(__m128i)(A), \
+ (__mmask8)(B)))
+#define _mm_maskz_shrdi_epi16(A, B, C, D) \
+ ((__m128i) \
+ __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), \
+ (__v8hi)(__m128i)(C),(int)(D), \
+ (__v8hi)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(A)))
+#define _mm_shrdi_epi32(A, B, C) \
+ ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi32(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), \
+ (__v4si)(__m128i)(D), (int)(E), \
+ (__v4si)(__m128i)(A), \
+ (__mmask8)(B)))
+#define _mm_maskz_shrdi_epi32(A, B, C, D) \
+ ((__m128i) \
+ __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), \
+ (__v4si)(__m128i)(C),(int)(D), \
+ (__v4si)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(A)))
+#define _mm_shrdi_epi64(A, B, C) \
+ ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi64(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), \
+ (__v2di)(__m128i)(D), (int)(E), \
+ (__v2di)(__m128i)(A), \
+ (__mmask8)(B)))
+#define _mm_maskz_shrdi_epi64(A, B, C, D) \
+ ((__m128i) \
+ __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), \
+ (__v2di)(__m128i)(C),(int)(D), \
+ (__v2di)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(A)))
+#define _mm256_shldi_epi16(A, B, C) \
+ ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), \
+ (__v16hi)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi16(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), \
+ (__v16hi)(__m256i)(D), \
+ (int)(E), \
+ (__v16hi)(__m256i)(A), \
+ (__mmask16)(B)))
+#define _mm256_maskz_shldi_epi16(A, B, C, D) \
+ ((__m256i) \
+ __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), \
+ (__v16hi)(__m256i)(C),(int)(D), \
+ (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask16)(A)))
+#define _mm256_shldi_epi32(A, B, C) \
+ ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), \
+ (__v8si)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi32(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), \
+ (__v8si)(__m256i)(D), (int)(E), \
+ (__v8si)(__m256i)(A), \
+ (__mmask8)(B)))
+#define _mm256_maskz_shldi_epi32(A, B, C, D) \
+ ((__m256i) \
+ __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), \
+ (__v8si)(__m256i)(C),(int)(D), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(A)))
+#define _mm256_shldi_epi64(A, B, C) \
+ ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi64(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), \
+ (__v4di)(__m256i)(D), (int)(E), \
+ (__v4di)(__m256i)(A), \
+ (__mmask8)(B)))
+#define _mm256_maskz_shldi_epi64(A, B, C, D) \
+ ((__m256i) \
+ __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B), \
+ (__v4di)(__m256i)(C),(int)(D), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(A)))
+#define _mm_shldi_epi16(A, B, C) \
+ ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), \
+ (__v8hi)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi16(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), \
+ (__v8hi)(__m128i)(D), (int)(E), \
+ (__v8hi)(__m128i)(A), \
+ (__mmask8)(B)))
+#define _mm_maskz_shldi_epi16(A, B, C, D) \
+ ((__m128i) \
+ __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), \
+ (__v8hi)(__m128i)(C),(int)(D), \
+ (__v8hi)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(A)))
+#define _mm_shldi_epi32(A, B, C) \
+ ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi32(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \
+ (__v4si)(__m128i)(D), (int)(E), \
+ (__v4si)(__m128i)(A), \
+ (__mmask8)(B)))
+#define _mm_maskz_shldi_epi32(A, B, C, D) \
+ ((__m128i) \
+ __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), \
+ (__v4si)(__m128i)(C),(int)(D), \
+ (__v4si)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(A)))
+#define _mm_shldi_epi64(A, B, C) \
+ ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), \
+ (__v2di)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi64(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), \
+ (__v2di)(__m128i)(D), (int)(E), \
+ (__v2di)(__m128i)(A), \
+ (__mmask8)(B)))
+#define _mm_maskz_shldi_epi64(A, B, C, D) \
+ ((__m128i) \
+ __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), \
+ (__v2di)(__m128i)(C),(int)(D), \
+ (__v2di)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(A)))
+#endif
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdv_epi16 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpshrdv_v16hi ((__v16hi)__A, (__v16hi) __B,
+ (__v16hi) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask ((__v16hi)__A,
+ (__v16hi) __C, (__v16hi) __D, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz ((__v16hi)__B,
+ (__v16hi) __C, (__v16hi) __D, (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdv_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpshrdv_v8si ((__v8si)__A, (__v8si) __B,
+ (__v8si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshrdv_v8si_mask ((__v8si)__A, (__v8si) __C,
+ (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz ((__v8si)__B, (__v8si) __C,
+ (__v8si) __D, (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdv_epi64 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpshrdv_v4di ((__v4di)__A, (__v4di) __B,
+ (__v4di) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshrdv_v4di_mask ((__v4di)__A, (__v4di) __C,
+ (__v4di) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz ((__v4di)__B, (__v4di) __C,
+ (__v4di) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdv_epi16 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpshrdv_v8hi ((__v8hi)__A, (__v8hi) __B,
+ (__v8hi) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask ((__v8hi)__A, (__v8hi) __C,
+ (__v8hi) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C,
+ (__v8hi) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdv_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpshrdv_v4si ((__v4si)__A, (__v4si) __B,
+ (__v4si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshrdv_v4si_mask ((__v4si)__A, (__v4si) __C,
+ (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz ((__v4si)__B, (__v4si) __C,
+ (__v4si) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdv_epi64 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpshrdv_v2di ((__v2di)__A, (__v2di) __B,
+ (__v2di) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshrdv_v2di_mask ((__v2di)__A, (__v2di) __C,
+ (__v2di) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz ((__v2di)__B, (__v2di) __C,
+ (__v2di) __D, (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldv_epi16 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpshldv_v16hi ((__v16hi)__A, (__v16hi) __B,
+ (__v16hi) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshldv_v16hi_mask ((__v16hi)__A,
+ (__v16hi) __C, (__v16hi) __D, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz ((__v16hi)__B,
+ (__v16hi) __C, (__v16hi) __D, (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldv_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpshldv_v8si ((__v8si)__A, (__v8si) __B,
+ (__v8si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshldv_v8si_mask ((__v8si)__A, (__v8si) __C,
+ (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshldv_v8si_maskz ((__v8si)__B, (__v8si) __C,
+ (__v8si) __D, (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldv_epi64 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpshldv_v4di ((__v4di)__A, (__v4di) __B,
+ (__v4di) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshldv_v4di_mask ((__v4di)__A, (__v4di) __C,
+ (__v4di) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpshldv_v4di_maskz ((__v4di)__B, (__v4di) __C,
+ (__v4di) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldv_epi16 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpshldv_v8hi ((__v8hi)__A, (__v8hi) __B,
+ (__v8hi) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshldv_v8hi_mask ((__v8hi)__A, (__v8hi) __C,
+ (__v8hi) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C,
+ (__v8hi) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldv_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpshldv_v4si ((__v4si)__A, (__v4si) __B,
+ (__v4si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshldv_v4si_mask ((__v4si)__A, (__v4si) __C,
+ (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshldv_v4si_maskz ((__v4si)__B, (__v4si) __C,
+ (__v4si) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldv_epi64 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpshldv_v2di ((__v2di)__A, (__v2di) __B,
+ (__v2di) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshldv_v2di_mask ((__v2di)__A, (__v2di) __C,
+ (__v2di) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpshldv_v2di_maskz ((__v2di)__B, (__v2di) __C,
+ (__v2di) __D, (__mmask8)__A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2VL__
+#undef __DISABLE_AVX512VBMI2VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2VL__ */
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \
+ !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512vl,avx512bw")
+#define __DISABLE_AVX512VBMI2VLBW__
+#endif /* __AVX512VBMI2VLBW__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi)__C,
+ (__v32qi)__A, (__mmask32)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi8 (__mmask32 __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __B,
+ (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi8 (void * __A, __mmask32 __B, __m256i __C)
+{
+ __builtin_ia32_compressstoreuqi256_mask ((__v32qi *) __A, (__v32qi) __C,
+ (__mmask32) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __C,
+ (__v32qi) __A,
+ (__mmask32) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi8 (__mmask32 __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_expandqi256_maskz ((__v32qi) __B,
+ (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi8 (__m256i __A, __mmask32 __B, const void * __C)
+{
+ return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *) __C,
+ (__v32qi) __A, (__mmask32) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi8 (__mmask32 __A, const void * __B)
+{
+ return (__m256i) __builtin_ia32_expandloadqi256_maskz ((const __v32qi *) __B,
+ (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2VLBW__
+#undef __DISABLE_AVX512VBMI2VLBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2VLBW__ */
+
+#endif /* _AVX512VBMI2VLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMIINTRIN_H_INCLUDED
+#define _AVX512VBMIINTRIN_H_INCLUDED
+
+#ifndef __AVX512VBMI__
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi")
+#define __DISABLE_AVX512VBMI__
+#endif /* __AVX512VBMI__ */
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+ (__v64qi) __Y,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+ (__v64qi) __Y,
+ (__v64qi)
+ _mm512_setzero_si512 (),
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
+{
+ return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+ (__v64qi) __Y,
+ (__v64qi)
+ _mm512_undefined_epi32 (),
+ (__mmask64) -1);
+}
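+
+/* Hedged sketch of VPMULTISHIFTQB: within each 64-bit lane of __Y,
+   output byte j is the 8-bit field starting at bit offset
+   (__X byte j & 63), read from that lane with wrap-around.  */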
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+ (__v64qi) __A,
+ (__v64qi)
+ _mm512_undefined_epi32 (),
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+ (__v64qi) __A,
+ (__v64qi)
+ _mm512_setzero_si512(),
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+ __m512i __B)
+{
+ return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+ (__v64qi) __A,
+ (__v64qi) __W,
+ (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+ /* idx */ ,
+ (__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
+ __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+ /* idx */ ,
+ (__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64)
+ __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
+ __mmask64 __U, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
+ (__v64qi) __I
+ /* idx */ ,
+ (__v64qi) __B,
+ (__mmask64)
+ __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
+ __m512i __I, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
+ /* idx */ ,
+ (__v64qi) __A,
+ (__v64qi) __B,
+ (__mmask64)
+ __U);
+}
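+
+/* Note on the permutes above: permutexvar_epi8 takes the index vector
+   first (the low 6 bits of each index byte select a byte of the second
+   operand, across all lanes), while permutex2var_epi8 treats __A and __B
+   as one 128-byte table indexed by __I, bit 6 of each index byte
+   choosing between the two sources.  */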
+
+#ifdef __DISABLE_AVX512VBMI__
+#undef __DISABLE_AVX512VBMI__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI__ */
+
+#endif /* _AVX512VBMIINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMIVLINTRIN_H_INCLUDED
+#define _AVX512VBMIVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi,avx512vl")
+#define __DISABLE_AVX512VBMIVL__
+#endif /* __AVX512VBMIVL__ */
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+ (__v32qi) __Y,
+ (__v32qi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+ (__v32qi) __Y,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+ (__v32qi) __Y,
+ (__v32qi)
+ _mm256_undefined_si256 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+ (__v16qi) __Y,
+ (__v16qi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+ (__v16qi) __Y,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+ (__v16qi) __Y,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+ (__v32qi) __A,
+ (__v32qi)
+ _mm256_undefined_si256 (),
+ (__mmask32) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+ (__v32qi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+ (__v32qi) __A,
+ (__v32qi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+ (__v16qi) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+ (__v16qi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+ (__v16qi) __A,
+ (__v16qi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+ /* idx */ ,
+ (__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
+ __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+ /* idx */ ,
+ (__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
+ __mmask32 __U, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
+ (__v32qi) __I
+ /* idx */ ,
+ (__v32qi) __B,
+ (__mmask32)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
+ __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
+ /* idx */ ,
+ (__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+ /* idx */ ,
+ (__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+ /* idx */ ,
+ (__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
+ (__v16qi) __I
+ /* idx */ ,
+ (__v16qi) __B,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
+ /* idx */ ,
+ (__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16)
+ __U);
+}
+
+#ifdef __DISABLE_AVX512VBMIVL__
+#undef __DISABLE_AVX512VBMIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMIVL__ */
+
+#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */
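A minimal usage sketch for the byte-granular permute defined above (not part of the patch; assumes GCC with -mavx512vbmi -mavx512vl). Note the index vector is the first argument, and each result byte is src[idx[i] & 31]:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  unsigned char src[32], idx[32], dst[32];
  for (int i = 0; i < 32; i++)
    {
      src[i] = (unsigned char) i;
      idx[i] = (unsigned char) (31 - i);   /* reverse the 32 bytes */
    }
  __m256i v = _mm256_loadu_si256 ((const __m256i *) src);
  __m256i p = _mm256_loadu_si256 ((const __m256i *) idx);
  __m256i r = _mm256_permutexvar_epi8 (p, v);  /* dst[i] = src[idx[i]] */
  _mm256_storeu_si256 ((__m256i *) dst, r);
  printf ("%u %u\n", dst[0], dst[31]);         /* prints "31 0" */
  return 0;
}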
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLBWINTRIN_H_INCLUDED
+#define _AVX512VLBWINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512bw")
+#define __DISABLE_AVX512VLBW__
+#endif /* __AVX512VLBW__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef short __v16hi_u __attribute__ ((__vector_size__ (32),
+ __may_alias__, __aligned__ (1)));
+typedef short __v8hi_u __attribute__ ((__vector_size__ (16),
+ __may_alias__, __aligned__ (1)));
+typedef char __v32qi_u __attribute__ ((__vector_size__ (32),
+ __may_alias__, __aligned__ (1)));
+typedef char __v16qi_u __attribute__ ((__vector_size__ (16),
+ __may_alias__, __aligned__ (1)));
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi8 (void *__P, __m256i __A)
+{
+ *(__v32qi_u *) __P = (__v32qi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+ __builtin_ia32_storedquqi256_mask ((char *) __P,
+ (__v32qi) __A,
+ (__mmask32) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi8 (void *__P, __m128i __A)
+{
+ *(__v16qi_u *) __P = (__v16qi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+ __builtin_ia32_storedquqi128_mask ((char *) __P,
+ (__v16qi) __A,
+ (__mmask16) __U);
+}
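A usage sketch (not part of the patch; assumes -mavx512vl -mavx512bw): writing only the first n bytes of a vector while leaving the rest of the destination untouched is the classic tail-handling pattern these masked stores enable:

#include <immintrin.h>

static inline void
store_first_n_bytes (void *dst, __m256i v, unsigned n)   /* n <= 32 */
{
  /* Low n mask bits set; masked-out lanes are not written.  */
  __mmask32 m = (n >= 32) ? (__mmask32) -1
                          : (__mmask32) ((1u << n) - 1);
  _mm256_mask_storeu_epi8 (dst, m, v);
}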
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi16 (void const *__P)
+{
+ return (__m256i) (*(__v16hi_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi16 (void const *__P)
+{
+ return (__m128i) (*(__v8hi_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
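The zero-masked loads suppress faults on masked-out elements, so a partial load cannot read past the end of a buffer. A sketch (not part of the patch; assumes -mavx512vl -mavx512bw):

#include <immintrin.h>

static inline __m128i
load_first_n_epi16 (short const *p, unsigned n)   /* n <= 8 */
{
  __mmask8 m = (__mmask8) ((1u << n) - 1);   /* low n lanes enabled */
  return _mm_maskz_loadu_epi16 (m, p);       /* tail lanes become 0 */
}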
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi8 (void const *__P)
+{
+ return (__m256i) (*(__v32qi_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi8 (void const *__P)
+{
+ return (__m128i) (*(__v16qi_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_storeu_epi8 (void *__P, __mmask16 __M, __m256i __A)
+{
+ __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi16_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi16_storeu_epi8 (void *__P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovswb128mem_mask ((unsigned long long *) __P, (__v8hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi16_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi16_storeu_epi8 (void *__P, __mmask16 __M, __m256i __A)
+{
+ __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi16_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi16_storeu_epi8 (void *__P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovuswb128mem_mask ((unsigned long long *) __P, (__v8hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi16_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+ (__v16qi) _mm_undefined_si128 (),
+ (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi16_storeu_epi8 (void *__P, __mmask16 __M, __m256i __A)
+{
+ __builtin_ia32_pmovuswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
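A usage sketch (not part of the patch; assumes -mavx512vl -mavx512bw): the unsigned-saturating down-conversion above replaces the SSE2 two-step clamp-and-pack dance when narrowing 16-bit pixels or counters:

#include <immintrin.h>
#include <stdint.h>

static inline void
pack_u16_to_u8 (uint8_t *dst, const uint16_t *src)
{
  /* Narrow 16 uint16_t to 16 bytes; values above 255 clamp to 255.  */
  __m256i v = _mm256_loadu_si256 ((const __m256i *) src);
  _mm_storeu_si128 ((__m128i *) dst, _mm256_cvtusepi16_epi8 (v));
}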
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
+ (__v32qi) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+ (__v32qi) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
+ (__v16hi) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+ (__v16hi) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
+ (__v8hi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+ (__v8hi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
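A usage sketch (not part of the patch): the masked set1 forms broadcast a scalar into only the lanes selected by the mask, here the odd-numbered words:

#include <immintrin.h>

static inline __m256i
splat_into_odd_words (__m256i v, short x)
{
  /* 0xAAAA sets mask bits 1,3,5,...; even lanes keep v's values.  */
  return _mm256_mask_set1_epi16 (v, (__mmask16) 0xAAAA, x);
}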
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+ (__v16hi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+ (__v16hi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+ (__v16hi) __A,
+ (__v16hi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+ (__v8hi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+ (__v8hi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+ (__v8hi) __A,
+ (__v8hi) __W,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I
+ /* idx */ ,
+ (__v16hi) __A,
+ (__v16hi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
+ __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I
+ /* idx */ ,
+ (__v16hi) __A,
+ (__v16hi) __B,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
+ __mmask16 __U, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+ (__v16hi) __I
+ /* idx */ ,
+ (__v16hi) __B,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
+ __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I
+ /* idx */ ,
+ (__v16hi) __A,
+ (__v16hi) __B,
+ (__mmask16)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I
+ /* idx */ ,
+ (__v8hi) __A,
+ (__v8hi) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I
+ /* idx */ ,
+ (__v8hi) __A,
+ (__v8hi) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+ (__v8hi) __I
+ /* idx */ ,
+ (__v8hi) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I
+ /* idx */ ,
+ (__v8hi) __A,
+ (__v8hi) __B,
+ (__mmask8)
+ __U);
+}
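A usage sketch (not part of the patch): in the 8-lane word case, bit 3 of each index selects the source vector (0 = first operand, 1 = second), so indices 0,8,1,9,... interleave the low halves of two vectors:

#include <immintrin.h>

static inline __m128i
interleave_low_words (__m128i a, __m128i b)
{
  /* Even result slots from a, odd slots from b.  */
  __m128i idx = _mm_setr_epi16 (0, 8, 1, 9, 2, 10, 3, 11);
  return _mm_permutex2var_epi16 (a, idx, b);
}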
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+ (__v32qi) __Y,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+ (__v32qi) __Y,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+ (__v16qi) __Y,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+ (__v16qi) __Y,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
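For context (not part of the patch): the pre-existing unmasked SSSE3/SSE2 counterparts of the two operations above compose into the standard u8 x s8 dot-product kernel, which the masked variants extend with per-lane predication:

#include <immintrin.h>

static inline __m128i
dot_u8s8_quads (__m128i u8, __m128i s8)
{
  /* vpmaddubsw: pairwise u8*s8 products summed into signed words.  */
  __m128i w = _mm_maddubs_epi16 (u8, s8);
  /* vpmaddwd with all-ones: pairwise word sums into four s32 lanes.  */
  return _mm_madd_epi16 (w, _mm_set1_epi16 (1));
}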
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi8_mask (__m128i __A)
+{
+ return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi8_mask (__m256i __A)
+{
+ return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi16_mask (__m128i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi16_mask (__m256i __A)
+{
+ return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi8 (__mmask16 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi8 (__mmask32 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi16 (__mmask8 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi16 (__mmask16 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
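A usage sketch (not part of the patch): movepi/movm convert between a vector's per-lane sign bits and a k-register, the AVX-512 analogue of pmovmskb together with its previously missing inverse:

#include <immintrin.h>

static inline __mmask16
sign_bits_to_mask (__m128i v)
{
  return _mm_movepi8_mask (v);   /* bit i = MSB of byte i */
}

static inline __m128i
mask_to_byte_lanes (__mmask16 m)
{
  return _mm_movm_epi8 (m);      /* byte i = bit i ? 0xFF : 0x00 */
}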
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+ (__v16qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+ (__v32qi) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+ (__v8hi) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+ (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+ (__v16hi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+ (__v16hi) __B, __U);
+}
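A usage sketch (not part of the patch): vptestmb sets mask bit i when the AND of the two byte operands is nonzero, so testing a vector against itself flags exactly the nonzero bytes, a handy primitive for strlen-style scans:

#include <immintrin.h>

static inline __mmask16
nonzero_bytes (__m128i v)
{
  return _mm_test_epi8_mask (v, v);   /* bit i = (v[i] != 0) */
}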
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __M);
+}
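A usage sketch (not part of the patch): the merge-masked min clamps only the lanes selected by the mask, while unselected lanes pass through from the writemask operand:

#include <immintrin.h>

static inline __m128i
clamp_selected_words (__m128i v, __mmask8 m, short hi)
{
  /* Selected lanes become min(v, hi); the rest keep v unchanged.  */
  return _mm_mask_min_epi16 (v, m, v, _mm_set1_epi16 (hi));
}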
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B, const int __N)
+{
+ return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A,
+ (__v4di) __B,
+ __N * 8,
+ (__v4di) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi8 (__mmask32 __U, __m256i __A, __m256i __B,
+ const int __N)
+{
+ return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A,
+ (__v4di) __B,
+ __N * 8,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B, const int __N)
+{
+ return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A,
+ (__v2di) __B,
+ __N * 8,
+ (__v2di) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi8 (__mmask16 __U, __m128i __A, __m128i __B,
+ const int __N)
+{
+ return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A,
+ (__v2di) __B,
+ __N * 8,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dbsad_epu8 (__m256i __A, __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ __imm,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dbsad_epu8 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ __imm,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dbsad_epu8 (__mmask16 __U, __m256i __A, __m256i __B,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ __imm,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dbsad_epu8 (__m128i __A, __m128i __B, const int __imm)
+{
+ return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ __imm,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dbsad_epu8 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B, const int __imm)
+{
+ return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ __imm,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ __imm,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
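A usage sketch (not part of the patch): vdbpsadbw's 8-bit immediate first shuffles the 32-bit blocks of the second operand before the quadruple SAD; 0xE4 (binary 11 10 01 00) is the identity shuffle, and the immediate must be a compile-time constant:

#include <immintrin.h>

static inline __m128i
dbsad_identity (__m128i a, __m128i b)
{
  return _mm_dbsad_epu8 (a, b, 0xE4);
}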
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi16_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi16_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi16_mask (__mmask16 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, __P,
+ (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi16_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, __P,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi8_mask (__mmask16 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, __P,
+ (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi8_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, __P,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi8_mask (__mmask32 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, __P,
+ (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi8_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, __P,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu16_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu16_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu16_mask (__mmask16 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, __P,
+ (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu16_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, __P,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu8_mask (__mmask16 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, __P,
+ (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu8_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, __P,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu8_mask (__mmask32 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, __P,
+ (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu8_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, __P,
+ (__mmask32) -1);
+}
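A usage sketch (not part of the patch): the const __P argument takes one of the _MM_CMPINT_* predicates defined by the AVX-512 headers; for example, counting the bytes of x that are less than or equal to the corresponding byte of y:

#include <immintrin.h>

static inline int
count_le_bytes (__m256i x, __m256i y)
{
  __mmask32 m = _mm256_cmp_epu8_mask (x, y, _MM_CMPINT_LE);
  return __builtin_popcount ((unsigned) m);
}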
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srli_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srli_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shufflehi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A,
+ __imm,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shufflehi_epi16 (__mmask16 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A,
+ __imm,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shufflehi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shufflehi_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shufflelo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A,
+ __imm,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shufflelo_epi16 (__mmask16 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A,
+ __imm,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shufflelo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shufflelo_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ int __B)
+{
+ return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B)
+{
+ return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+#else
+#define _mm256_mask_alignr_epi8(W, U, X, Y, N) \
+ ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)((N) * 8), \
+ (__v4di)(__m256i)(W), (__mmask32)(U)))
+
+#define _mm256_mask_srli_epi16(W, U, A, B) \
+ ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), \
+ (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+
+#define _mm256_maskz_srli_epi16(U, A, B) \
+ ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), \
+ (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
+
+#define _mm_mask_srli_epi16(W, U, A, B) \
+ ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), \
+ (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srli_epi16(U, A, B) \
+ ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), \
+ (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+
+#define _mm256_mask_srai_epi16(W, U, A, B) \
+ ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), \
+ (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+
+#define _mm256_maskz_srai_epi16(U, A, B) \
+ ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), \
+ (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
+
+#define _mm_mask_srai_epi16(W, U, A, B) \
+ ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), \
+ (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srai_epi16(U, A, B) \
+ ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), \
+ (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+
+#define _mm256_mask_shufflehi_epi16(W, U, A, B) \
+ ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), \
+ (__v16hi)(__m256i)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_shufflehi_epi16(U, A, B) \
+ ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), \
+ (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask16)(U)))
+
+#define _mm_mask_shufflehi_epi16(W, U, A, B) \
+ ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), \
+ (__v8hi)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_shufflehi_epi16(U, A, B) \
+ ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), \
+ (__v8hi)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(U)))
+
+#define _mm256_mask_shufflelo_epi16(W, U, A, B) \
+ ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), \
+ (__v16hi)(__m256i)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_shufflelo_epi16(U, A, B) \
+ ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), \
+ (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask16)(U)))
+
+#define _mm_mask_shufflelo_epi16(W, U, A, B) \
+ ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), \
+ (__v8hi)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_shufflelo_epi16(U, A, B) \
+ ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), \
+ (__v8hi)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_alignr_epi8(U, X, Y, N) \
+ ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)((N) * 8), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask32)(U)))
+
+#define _mm_mask_alignr_epi8(W, U, X, Y, N) \
+ ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)((N) * 8), \
+ (__v2di)(__m128i)(W), (__mmask16)(U)))
+
+#define _mm_maskz_alignr_epi8(U, X, Y, N) \
+ ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)((N) * 8), \
+ (__v2di)(__m128i)_mm_setzero_si128 (), \
+ (__mmask16)(U)))
+
+#define _mm_mask_slli_epi16(W, U, X, C) \
+ ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
+ (__v8hi)(__m128i)(W),\
+ (__mmask8)(U)))
+
+#define _mm_maskz_slli_epi16(U, X, C) \
+ ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
+ (__v8hi)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
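+/* dbpsadbw computes sums of absolute differences of unsigned bytes
+   after a dword shuffle of the second source selected by the
+   immediate (see the VDBPSADBW instruction definition).  */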
+#define _mm256_dbsad_epu8(X, Y, C) \
+ ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), \
+ (__v32qi)(__m256i) (Y), (int) (C), \
+ (__v16hi)(__m256i)_mm256_setzero_si256(),\
+ (__mmask16)-1))
+
+#define _mm256_mask_slli_epi16(W, U, X, C) \
+ ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
+ (__v16hi)(__m256i)(W),\
+ (__mmask16)(U)))
+
+#define _mm256_maskz_slli_epi16(U, X, C) \
+ ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
+ (__v16hi)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask16)(U)))
+
+#define _mm256_mask_dbsad_epu8(W, U, X, Y, C) \
+ ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), \
+ (__v32qi)(__m256i) (Y), (int) (C), \
+ (__v16hi)(__m256i)(W), \
+ (__mmask16)(U)))
+
+#define _mm256_maskz_dbsad_epu8(U, X, Y, C) \
+ ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), \
+ (__v32qi)(__m256i) (Y), (int) (C), \
+ (__v16hi)(__m256i)_mm256_setzero_si256(),\
+ (__mmask16)(U)))
+
+#define _mm_dbsad_epu8(X, Y, C) \
+ ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), \
+ (__v16qi)(__m128i) (Y), (int) (C), \
+ (__v8hi)(__m128i)_mm_setzero_si128(), \
+ (__mmask8)-1))
+
+#define _mm_mask_dbsad_epu8(W, U, X, Y, C) \
+ ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), \
+ (__v16qi)(__m128i) (Y), (int) (C), \
+ (__v8hi)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_dbsad_epu8(U, X, Y, C) \
+ ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), \
+ (__v16qi)(__m128i) (Y), (int) (C), \
+ (__v8hi)(__m128i)_mm_setzero_si128(), \
+ (__mmask8)(U)))
+
+#define _mm_mask_blend_epi16(__U, __A, __W) \
+ ((__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) (__A), \
+ (__v8hi) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm_mask_blend_epi8(__U, __A, __W) \
+ ((__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) (__A), \
+ (__v16qi) (__W), \
+ (__mmask16) (__U)))
+
+#define _mm256_mask_blend_epi16(__U, __A, __W) \
+ ((__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) (__A), \
+ (__v16hi) (__W), \
+ (__mmask16) (__U)))
+
+#define _mm256_mask_blend_epi8(__U, __A, __W) \
+ ((__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) (__A), \
+ (__v32qi) (__W), \
+ (__mmask32) (__U)))
+
+#define _mm_cmp_epi16_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), \
+ (__v8hi)(__m128i)(Y), (int)(P),\
+ (__mmask8)(-1)))
+
+#define _mm_cmp_epi8_mask(X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(P),\
+ (__mmask16)(-1)))
+
+#define _mm256_cmp_epi16_mask(X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), \
+ (__v16hi)(__m256i)(Y), (int)(P),\
+ (__mmask16)(-1)))
+
+#define _mm256_cmp_epi8_mask(X, Y, P) \
+ ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), \
+ (__v32qi)(__m256i)(Y), (int)(P),\
+ (__mmask32)(-1)))
+
+#define _mm_cmp_epu16_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), \
+ (__v8hi)(__m128i)(Y), (int)(P),\
+ (__mmask8)(-1)))
+
+#define _mm_cmp_epu8_mask(X, Y, P) \
+ ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(P),\
+ (__mmask16)(-1)))
+
+#define _mm256_cmp_epu16_mask(X, Y, P) \
+ ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), \
+ (__v16hi)(__m256i)(Y), (int)(P),\
+ (__mmask16)(-1)))
+
+#define _mm256_cmp_epu8_mask(X, Y, P) \
+ ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), \
+ (__v32qi)(__m256i)(Y), (int)(P),\
+ (__mmask32)(-1)))
+
+#define _mm_mask_cmp_epi16_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), \
+ (__v8hi)(__m128i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_mask_cmp_epi8_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(P),\
+ (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epi16_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), \
+ (__v16hi)(__m256i)(Y), (int)(P),\
+ (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epi8_mask(M, X, Y, P) \
+ ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), \
+ (__v32qi)(__m256i)(Y), (int)(P),\
+ (__mmask32)(M)))
+
+#define _mm_mask_cmp_epu16_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), \
+ (__v8hi)(__m128i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_mask_cmp_epu8_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(P),\
+ (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epu16_mask(M, X, Y, P) \
+ ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), \
+ (__v16hi)(__m256i)(Y), (int)(P),\
+ (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epu8_mask(M, X, Y, P) \
+ ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), \
+ (__v32qi)(__m256i)(Y), (int)(P),\
+ (__mmask32)(M)))
+#endif
+
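+/* The comparison builtins used below take an explicit predicate
+   immediate: 0 = EQ, 1 = LT, 2 = LE, 3 = FALSE, 4 = NE, 5 = GE (NLT),
+   6 = GT (NLE), 7 = TRUE, matching the _MM_CMPINT_* values.  The
+   ucmp variants compare unsigned elements.  */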
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 4,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 1,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 5,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 2,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 4,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 1,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 5,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 2,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 4,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 1,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 5,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 2,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 4,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 1,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 5,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi8_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 2,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi16_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 2,
+ (__mmask8) -1);
+}
+
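+/* Masked 16-bit multiply-high forms: pmulhrsw rounds the Q15 product
+   as (((a * b) >> 14) + 1) >> 1, while pmulhuw/pmulhw return the high
+   16 bits of the unsigned resp. signed 32-bit product.  */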
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+ (__v16hi) __Y,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+ (__v16hi) __Y,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+ (__v8hi) __Y,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+ (__v8hi) __Y,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
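+/* Masked widening conversions: pmovsxbw sign-extends and pmovzxbw
+   zero-extends packed bytes to 16-bit elements.  */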
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
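+/* pavgb/pavgw compute the unsigned rounding average
+   (a + b + 1) >> 1 of each element pair.  */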
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
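+/* Masked byte/word add and subtract forms: the plain padd/psub
+   variants wrap on overflow, while padds/psubs saturate to the
+   signed range and paddus/psubus to the unsigned range.  */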
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
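+/* punpckhbw/punpckhwd and punpcklbw/punpcklwd interleave elements
+   from the high resp. low half of each 128-bit lane of the two
+   sources.  */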
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
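+/* Fixed-predicate comparisons: equality and signed greater-than map
+   to the pcmpeq/pcmpgt builtins, while the unsigned forms reuse the
+   ucmp builtins with predicates 0 (EQ) and 6 (GT).  */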
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi8_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu8_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+ (__v16qi) __B, 0,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+ (__v16qi) __B, 0,
+ __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu8_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+ (__v32qi) __B, 0,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi8_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+ (__v32qi) __B, 0,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu16_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+ (__v8hi) __B, 0,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi16_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+ (__v8hi) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A,
+ (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu16_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+ (__v16hi) __B, 0,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi16_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+ (__v16hi) __B, 0,
+ __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu8_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+ (__v16qi) __B, 6,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi8_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+ (__v16qi) __B, 6,
+ __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu8_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+ (__v32qi) __B, 6,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi8_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+ (__v32qi) __B, 6,
+ __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu16_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+ (__v8hi) __B, 6,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi16_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+ (__v8hi) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A,
+ (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu16_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+ (__v16hi) __B, 6,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi16_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+ (__v16hi) __B, 6,
+ __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ __U);
+}
+
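+/* ptestnm sets a mask bit whenever the bitwise AND of the two
+   corresponding elements is zero.  */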
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi8_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+ (__v16qi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+ (__v16qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+ (__v32qi) __B,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+ (__v32qi) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi16_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+ (__v8hi) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+ (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+ (__v16hi) __B,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+ (__v16hi) __B, __U);
+}
+
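+/* pshufb selects each destination byte from within the same 128-bit
+   lane using the low four bits of the control byte, and zeroes the
+   byte when the control byte's high bit is set.  */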
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+ (__v32qi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
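+/* packsswb/packuswb narrow 16-bit elements to bytes with signed
+   resp. unsigned saturation, packing A then B within each 128-bit
+   lane.  */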
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v32qi) __W,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v16qi) __W,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v32qi) __W,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v16qi) __W,
+ __M);
+}
+
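+/* pabsb/pabsw compute the per-element absolute value.  */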
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+ (__v32qi) __W,
+ (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+ (__v32qi)
+ _mm256_setzero_si256 (),
+ (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 4,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 1,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 5,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask32
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu8_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 2,
+ (__mmask32) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 4,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 1,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 5,
+ (__mmask16) -1);
+}
+
+extern __inline __mmask16
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu16_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 2,
+ (__mmask16) -1);
+}
+
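+/* Unaligned stores go through the __v16hi_u/__v8hi_u may-alias
+   types; the masked forms write only the selected elements and do
+   not fault on masked-out ones.  */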
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi16 (void *__P, __m256i __A)
+{
+ *(__v16hi_u *) __P = (__v16hi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
+{
+ __builtin_ia32_storedquhi256_mask ((short *) __P,
+ (__v16hi) __A,
+ (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi16 (void *__P, __m128i __A)
+{
+ *(__v8hi_u *) __P = (__v8hi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_storedquhi128_mask ((short *) __P,
+ (__v8hi) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
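+/* psrlw/psraw with a vector count shift every element by the count
+   held in the low 64 bits of __B; counts above 15 produce zero for
+   the logical shift and sign replication for the arithmetic one.  */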
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
+ (__v8hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
+ (__v8hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
+ (__v8hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
+ (__v8hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+ (__v16qi) __B,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
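+
+/* Illustrative example (hypothetical values, not part of the
+   upstream header): with __U = 0x00FF,
+
+     _mm_maskz_adds_epi8 (0x00FF, _mm_set1_epi8 (100),
+			  _mm_set1_epi8 (100))
+
+   saturates the low eight bytes to 127 (100 + 100 exceeds the signed
+   8-bit range) and zeroes the high eight bytes via the writemask.  */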
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovwb128mem_mask ((unsigned long long *) __P, (__v8hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
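+/* The srav/srlv/sllv forms below shift each 16-bit lane by the count
+   held in the corresponding lane of __B (arithmetic right, logical
+   right, and left shift respectively).  Counts greater than 15 yield
+   zero, or the sign fill in the arithmetic case.  */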
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi16 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+ (__v16hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
+ (__v8hi) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
+ (__v8hi) __B,
+ (__v16hi) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
+ (__v8hi) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v16hi) __W,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packus_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v8hi) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v16hi)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v16hi) __W,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packs_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v8hi) __W, __M);
+}
+
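+/* The third argument to the __builtin_ia32_[u]cmp{b,w} builtins
+   below is the _MM_CMPINT_* predicate: 0 = EQ, 1 = LT, 2 = LE,
+   4 = NE, 5 = NLT (>=), 6 = NLE (>).  Only 1, 2, 4 and 5 appear
+   here.  */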
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 4,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 1,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 5,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 2,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 4,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 1,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 5,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+ (__v16qi) __Y, 2,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+ (__v8hi) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 4,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 1,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 5,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 2,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 4,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 1,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 5,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 2,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 4,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 1,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 5,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+ (__v32qi) __Y, 2,
+ (__mmask32) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 4,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 1,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 5,
+ (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+ (__v16hi) __Y, 2,
+ (__mmask16) __M);
+}
+
+#ifdef __DISABLE_AVX512VLBW__
+#undef __DISABLE_AVX512VLBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VLBW__ */
+
+#endif /* _AVX512VLBWINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLDQINTRIN_H_INCLUDED
+#define _AVX512VLDQINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512DQ__)
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512dq")
+#define __DISABLE_AVX512VLDQ__
+#endif /* !__AVX512VL__ || !__AVX512DQ__ */
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epi64 (__m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi64 (__m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epu64 (__m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epu64 (__m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
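+/* Unlike the truncating cvttpd conversions above, the cvtpd forms
+   below round according to the current MXCSR rounding mode.  */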
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epi64 (__m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epi64 (__m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epu64 (__m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A)
+{
+ return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epu64 (__m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epi64 (__m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi64 (__m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epu64 (__m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epu64 (__m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
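+/* The broadcast_{f,i}{64x2,32x2} forms replicate a two-element group
+   of __A (the full 128-bit source for the 64x2 forms, its low 64
+   bits for the 32x2 forms) across the whole destination.  */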
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_f64x2 (__m128d __A)
+{
+ return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
+ __A,
+ (__v4df)_mm256_undefined_pd(),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
+ __A,
+ (__v4df)
+ __O, __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+ return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
+ __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_i64x2 (__m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
+ __A,
+ (__v4di)_mm256_undefined_si256(),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
+ __A,
+ (__v4di)
+ __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
+ __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_f32x2 (__m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+ (__v8sf)_mm256_undefined_ps(),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+ (__v8sf) __O,
+ __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_i32x2 (__m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
+ __A,
+ (__v8si)_mm256_undefined_si256(),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
+ __A,
+ (__v8si)
+ __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
+ __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcast_i32x2 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
+ __A,
+ (__v4si)_mm_undefined_si128(),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
+ __A,
+ (__v4si)
+ __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
+ __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
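+/* The unmasked mullo_epi64 forms are plain GNU C vector multiplies,
+   so the compiler can constant-fold and, with the avx512dq target in
+   force here, select vpmullq; the masked forms call the vpmullq
+   builtin directly.  */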
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du) __A * (__v4du) __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du) __A * (__v2du) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256 __B)
+{
+ return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epi64 (__m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi64 (__m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epu64 (__m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epu64 (__m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
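+/* Note the narrowing signatures below: converting four (or two)
+   64-bit integers yields at most four floats, so even the 256-bit
+   source variants return __m128.  */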
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_ps (__m256i __A)
+{
+ return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A)
+{
+ return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A)
+{
+ return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_ps (__m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu64_ps (__m256i __A)
+{
+ return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A)
+{
+ return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A)
+{
+ return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu64_ps (__m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_pd (__m256i __A)
+{
+ return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A)
+{
+ return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_pd (__m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu64_pd (__m256i __A)
+{
+ return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A)
+{
+ return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu64_pd (__m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
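+/* Mask-to-vector moves (VPMOVM2D/VPMOVM2Q): each bit of the mask is
+   broadcast to an all-ones or all-zero lane, so e.g. (illustrative)
+   _mm_movm_epi32 ((__mmask8) 0x5) yields { -1, 0, -1, 0 }.  */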
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi32 (__mmask8 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi32 (__mmask8 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi64 (__mmask8 __A)
+{
+ return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi64 (__mmask8 __A)
+{
+ return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
+}
+
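+/* Vector-to-mask moves (VPMOVD2M/VPMOVQ2M): the inverse direction,
+   collecting the sign bit of every lane into a __mmask8.  */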
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi32_mask (__m128i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi32_mask (__m256i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi64_mask (__m128i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi64_mask (__m256i __A)
+{
+ return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
+}
+
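+/* The intrinsics below take a compile-time immediate operand.  When
+   __OPTIMIZE__ is defined they are provided as always-inline
+   functions; the #else branch supplies equivalent macros so that the
+   immediate is still a constant expression for the builtin when
+   compiling without optimization.  */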
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf64x2_pd (__m256d __A, const int __imm)
+{
+ return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+ __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m256d __A,
+ const int __imm)
+{
+ return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+ __imm,
+ (__v2df) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extractf64x2_pd (__mmask8 __U, __m256d __A,
+ const int __imm)
+{
+ return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+ __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti64x2_epi64 (__m256i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
+ __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
+ __imm,
+ (__v2di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extracti64x2_epi64 (__mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
+ __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_pd (__m256d __A, int __B)
+{
+ return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_pd (__m256d __W, __mmask8 __U, __m256d __A, int __B)
+{
+ return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_pd (__mmask8 __U, __m256d __A, int __B)
+{
+ return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_pd (__m128d __A, int __B)
+{
+ return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_pd (__m128d __W, __mmask8 __U, __m128d __A, int __B)
+{
+ return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_pd (__mmask8 __U, __m128d __A, int __B)
+{
+ return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_ps (__m256 __A, int __B)
+{
+ return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_ps (__m256 __W, __mmask8 __U, __m256 __A, int __B)
+{
+ return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_ps (__mmask8 __U, __m256 __A, int __B)
+{
+ return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ps (__m128 __A, int __B)
+{
+ return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ps (__m128 __W, __mmask8 __U, __m128 __A, int __B)
+{
+ return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ps (__mmask8 __U, __m128 __A, int __B)
+{
+ return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_range_pd (__m256d __A, __m256d __B, int __C)
+{
+ return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
+ (__v4df) __B, __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_range_pd (__m256d __W, __mmask8 __U,
+ __m256d __A, __m256d __B, int __C)
+{
+ return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
+ (__v4df) __B, __C,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_range_pd (__mmask8 __U, __m256d __A, __m256d __B, int __C)
+{
+ return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
+ (__v4df) __B, __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_pd (__m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_pd (__m128d __W, __mmask8 __U,
+ __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_pd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+ return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
+ (__v2df) __B, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_range_ps (__m256 __A, __m256 __B, int __C)
+{
+ return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
+ (__v8sf) __B, __C,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_range_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B,
+ int __C)
+{
+ return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
+ (__v8sf) __B, __C,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_range_ps (__mmask8 __U, __m256 __A, __m256 __B, int __C)
+{
+ return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
+ (__v8sf) __B, __C,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_ps (__m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_ps (__m128 __W, __mmask8 __U,
+ __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_ps (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+ return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
+ (__v4sf) __B, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_pd_mask (__mmask8 __U, __m256d __A,
+ const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_pd_mask (__m256d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_ps_mask (__mmask8 __U, __m256 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_ps_mask (__m256 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_pd_mask (__mmask8 __U, __m128d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_pd_mask (__m128d __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_ps_mask (__mmask8 __U, __m128 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A,
+ __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ps_mask (__m128 __A, const int __imm)
+{
+ return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
+ (__v2di) __B,
+ __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_inserti64x2 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
+ (__v2di) __B,
+ __imm,
+ (__v4di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_inserti64x2 (__mmask8 __U, __m256i __A, __m128i __B,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
+ (__v2di) __B,
+ __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm)
+{
+ return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
+ (__v2df) __B,
+ __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_insertf64x2 (__m256d __W, __mmask8 __U, __m256d __A,
+ __m128d __B, const int __imm)
+{
+ return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
+ (__v2df) __B,
+ __imm,
+ (__v4df) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_insertf64x2 (__mmask8 __U, __m256d __A, __m128d __B,
+ const int __imm)
+{
+ return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
+ (__v2df) __B,
+ __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+#else
+#define _mm256_insertf64x2(X, Y, C) \
+ ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
+ (__v2df)(__m128d) (Y), (int) (C), \
+ (__v4df)(__m256d)_mm256_setzero_pd(), \
+ (__mmask8)-1))
+
+#define _mm256_mask_insertf64x2(W, U, X, Y, C) \
+ ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
+ (__v2df)(__m128d) (Y), (int) (C), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_insertf64x2(U, X, Y, C) \
+ ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
+ (__v2df)(__m128d) (Y), (int) (C), \
+ (__v4df)(__m256d)_mm256_setzero_pd(), \
+ (__mmask8)(U)))
+
+#define _mm256_inserti64x2(X, Y, C) \
+ ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
+ (__v2di)(__m128i) (Y), (int) (C), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_inserti64x2(W, U, X, Y, C) \
+ ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
+ (__v2di)(__m128i) (Y), (int) (C), \
+ (__v4di)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_inserti64x2(U, X, Y, C) \
+ ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
+ (__v2di)(__m128i) (Y), (int) (C), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm256_extractf64x2_pd(X, C) \
+ ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
+ (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_mask_extractf64x2_pd(W, U, X, C) \
+ ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
+ (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U)))
+
+#define _mm256_maskz_extractf64x2_pd(U, X, C) \
+ ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
+ (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8) (U)))
+
+#define _mm256_extracti64x2_epi64(X, C) \
+ ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
+ (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm256_mask_extracti64x2_epi64(W, U, X, C) \
+ ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
+ (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm256_maskz_extracti64x2_epi64(U, X, C) \
+ ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
+ (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
+#define _mm256_reduce_pd(A, B) \
+ ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_mask_reduce_pd(W, U, A, B) \
+ ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_reduce_pd(U, A, B) \
+ ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
+
+#define _mm_reduce_pd(A, B) \
+ ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), \
+ (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1))
+
+#define _mm_mask_reduce_pd(W, U, A, B) \
+ ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), \
+ (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_pd(U, A, B) \
+ ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), \
+ (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
+
+#define _mm256_reduce_ps(A, B) \
+ ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), \
+ (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))
+
+#define _mm256_mask_reduce_ps(W, U, A, B) \
+ ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), \
+ (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_reduce_ps(U, A, B) \
+ ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), \
+ (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))
+
+#define _mm_reduce_ps(A, B) \
+ ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), \
+ (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
+
+#define _mm_mask_reduce_ps(W, U, A, B) \
+ ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), \
+ (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_ps(U, A, B) \
+ ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), \
+ (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
+
+#define _mm256_range_pd(A, B, C) \
+ ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)_mm256_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_maskz_range_pd(U, A, B, C) \
+ ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
+
+#define _mm_range_pd(A, B, C) \
+ ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_range_ps(A, B, C) \
+ ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))
+
+#define _mm256_mask_range_ps(W, U, A, B, C) \
+ ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_range_ps(U, A, B, C) \
+ ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))
+
+#define _mm_range_ps(A, B, C) \
+ ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
+
+#define _mm_mask_range_ps(W, U, A, B, C) \
+ ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_range_ps(U, A, B, C) \
+ ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
+
+#define _mm256_mask_range_pd(W, U, A, B, C) \
+ ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm_mask_range_pd(W, U, A, B, C) \
+ ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_range_pd(U, A, B, C) \
+ ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
+
+#define _mm256_mask_fpclass_pd_mask(u, X, C) \
+ ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \
+ (int) (C),(__mmask8)(u)))
+
+#define _mm256_mask_fpclass_ps_mask(u, X, C) \
+ ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), \
+ (int) (C),(__mmask8)(u)))
+
+#define _mm_mask_fpclass_pd_mask(u, X, C) \
+ ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \
+ (int) (C),(__mmask8)(u)))
+
+#define _mm_mask_fpclass_ps_mask(u, X, C) \
+ ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), \
+ (int) (C),(__mmask8)(u)))
+
+#define _mm256_fpclass_pd_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \
+ (int) (C),(__mmask8)-1))
+
+#define _mm256_fpclass_ps_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), \
+ (int) (C),(__mmask8)-1))
+
+#define _mm_fpclass_pd_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \
+ (int) (C),(__mmask8)-1))
+
+#define _mm_fpclass_ps_mask(X, C) \
+ ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), \
+ (int) (C),(__mmask8)-1))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_AVX512VLDQ__
+#undef __DISABLE_AVX512VLDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VLDQ__ */
+
+#endif /* _AVX512VLDQINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLINTRIN_H_INCLUDED
+#define _AVX512VLINTRIN_H_INCLUDED
+
+#ifndef __AVX512VL__
+#pragma GCC push_options
+#pragma GCC target("avx512vl")
+#define __DISABLE_AVX512VL__
+#endif /* __AVX512VL__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef unsigned int __mmask32;
+typedef int __v4si_u __attribute__ ((__vector_size__ (16), \
+ __may_alias__, __aligned__ (1)));
+typedef int __v8si_u __attribute__ ((__vector_size__ (32), \
+ __may_alias__, __aligned__ (1)));
+typedef long long __v2di_u __attribute__ ((__vector_size__ (16), \
+ __may_alias__, __aligned__ (1)));
+typedef long long __v4di_u __attribute__ ((__vector_size__ (32), \
+ __may_alias__, __aligned__ (1)));
+
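+/* The *_u typedefs above are byte-aligned, may_alias variants of the
+   vector types; they back the unaligned loadu/storeu intrinsics
+   further down in this file.  */
+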
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
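+/* Masked aligned loads/stores (VMOVAPD).  __P must be suitably
+   aligned (32 bytes for the 256-bit forms, 16 for the 128-bit ones);
+   the loadu/storeu intrinsics below handle unaligned memory.  */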
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+ __builtin_ia32_storeapd256_mask ((__v4df *) __P,
+ (__v4df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storeapd128_mask ((__v2df *) __P,
+ (__v2df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+ __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
+ (__v8sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_epi64 (void const *__P)
+{
+ return (__m256i) (*(__v4di *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+ (__v4di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_epi64 (void const *__P)
+{
+ return (__m128i) (*(__v2di *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+ (__v2di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
+ (__v4di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
+ (__v2di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_epi32 (void const *__P)
+{
+ return (__m256i) (*(__v8si *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+ (__v8si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_epi32 (void const *__P)
+{
+ return (__m128i) (*(__v4si *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+ (__v4si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_epi32 (void *__P, __m256i __A)
+{
+ *(__v8si *) __P = (__v8si) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
+ (__v8si) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_epi32 (void *__P, __m128i __A)
+{
+ *(__v4si *) __P = (__v4si) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
+ (__v4si) __A,
+ (__mmask8) __U);
+}
+
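+/* Masked packed arithmetic (VADDPD/VADDPS, VSUBPD/VSUBPS).  For
+   example (illustrative), _mm_maskz_add_pd ((__mmask8) 0x1, __a, __b)
+   computes lane 0 of __a + __b and zeroes lane 1.  */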
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_epi64 (void *__P, __m256i __A)
+{
+ *(__m256i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_epi64 (void *__P, __m128i __A)
+{
+ *(__m128i *) __P = __A;
+}
+
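+/* Unaligned masked loads/stores (VMOVUPD/VMOVUPS and
+   VMOVDQU64/VMOVDQU32).  Unlike the aligned load/store forms above,
+   __P may have any alignment.  */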
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+ __builtin_ia32_storeupd256_mask ((double *) __P,
+ (__v4df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_storeupd128_mask ((double *) __P,
+ (__v2df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+ __builtin_ia32_storeups256_mask ((float *) __P,
+ (__v8sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_storeups128_mask ((float *) __P,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi64 (void const *__P)
+{
+ return (__m256i) (*(__v4di_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi64 (void const *__P)
+{
+ return (__m128i) (*(__v2di_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi64 (void *__P, __m256i __A)
+{
+ *(__m256i_u *) __P = (__m256i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_storedqudi256_mask ((long long *) __P,
+ (__v4di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi64 (void *__P, __m128i __A)
+{
+ *(__m128i_u *) __P = (__m128i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_storedqudi128_mask ((long long *) __P,
+ (__v2di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi32 (void const *__P)
+{
+ return (__m256i) (*(__v8si_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi32 (void const *__P)
+{
+ return (__m128i) (*(__v4si_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi32 (void *__P, __m256i __A)
+{
+ *(__m256i_u *) __P = (__m256i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_storedqusi256_mask ((int *) __P,
+ (__v8si) __A,
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi32 (void *__P, __m128i __A)
+{
+ *(__m128i_u *) __P = (__m128i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_storedqusi128_mask ((int *) __P,
+ (__v4si) __A,
+ (__mmask8) __U);
+}
+
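+/* Masked absolute value.  The 64-bit element forms (VPABSQ) are new
+   with AVX-512; only the 8/16/32-bit variants exist in AVX2.  */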
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi64 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi64 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
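+/* Float-to-unsigned-integer conversions (VCVTPD2UDQ and friends).
+   The cvtt* forms truncate toward zero; the plain cvt* forms round
+   according to the current rounding mode (MXCSR).  */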
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epu32 (__m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epu32 (__m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epu32 (__m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epu32 (__m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epu32 (__m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epu32 (__m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A)
+{
+ return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
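+/* Widening conversions to floating point: packed 32-bit integers
+   (signed via vcvtdq2pd/vcvtdq2ps, unsigned via vcvtudq2pd/vcvtudq2ps)
+   and packed single to packed double, with the usual merge/zero
+   masking described above.  */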
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A)
+{
+ return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_pd (__m128i __A)
+{
+ return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A)
+{
+ return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_pd (__m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A)
+{
+ return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A)
+{
+ return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_ps (__m256i __A)
+{
+ return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A)
+{
+ return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_ps (__m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A)
+{
+ return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A)
+{
+ return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A)
+{
+ return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
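+/* Narrowing integer down-conversions (VPMOV*).  The plain forms
+   truncate, keeping the low bits of each element, and the register
+   forms zero the destination bits above the packed result.  Each is
+   paired with a _storeu_ variant that writes only the narrowed
+   elements to memory under the mask.  Illustrative sketch:
+
+     __m128i v = _mm_set1_epi32 (0x12345678);
+     __m128i r = _mm_cvtepi32_epi8 (v);
+     // low 4 bytes of r = 0x78 each; bytes 4..15 are zeroed.  */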
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
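+/* Signed-saturating down-conversions (VPMOVSDB): each 32-bit element
+   is clamped to the signed 8-bit range [-128, 127] before narrowing.  */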
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi32_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi32_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
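+/* Unsigned-saturating down-conversions (VPMOVUSDB): elements are
+   treated as unsigned and clamped to [0, 255] before narrowing.  */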
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi32_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi32_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi32_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+ (__v8hi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi32_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi32_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi32_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
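+/* The same truncating and saturating down-conversion pattern,
+   repeated for 64-bit source elements (VPMOVQB/VPMOVQW/VPMOVQD and
+   their signed/unsigned saturating forms).  */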
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi64_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi64_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+ (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi64_epi8 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi64_epi8 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+ (__v16qi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+ (__v16qi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+ (__v16qi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+ (__v8hi) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi64_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi64_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi64_epi16 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi64_epi16 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+ (__v8hi)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+ (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+ (__v4si)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovqd128mem_mask ((unsigned long long *) __P,
+ (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+ (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_epi32 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+ (__v4si)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+ (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi64_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+ (__v4si)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovsqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+ (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi64_epi32 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+ (__v4si)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+ (__v4si) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi64_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+ (__v4si)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+ __builtin_ia32_pmovusqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+ (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi64_epi32 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+ (__v4si)
+ _mm_undefined_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+ __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+ (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+ return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
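+/* Masked broadcasts: element 0 of the source (or a GPR value for the
+   set1 forms) is replicated across the destination under the write
+   mask.  Illustrative sketch:
+
+     __m128 a = _mm_set_ss (2.0f);
+     __m256 r = _mm256_maskz_broadcastss_ps (0x0F, a);
+     // lanes 0..3 = 2.0f, lanes 4..7 zeroed by the mask.  */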
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A,
+ (__v8sf) __O,
+ __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ __M);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A,
+ (__v4sf) __O,
+ __M);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+ return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
+{
+ return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A,
+ (__v4df) __O,
+ __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+ return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A,
+ (__v8si) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi32 (__m256i __O, __mmask8 __M, int __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, (__v8si) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi32 (__mmask8 __M, int __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A,
+ (__v4si) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi32 (__m128i __O, __mmask8 __M, int __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastd128_gpr_mask (__A, (__v4si) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi32 (__mmask8 __M, int __A)
+{
+ return (__m128i)
+ __builtin_ia32_pbroadcastd128_gpr_mask (__A,
+ (__v4si) _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A,
+ (__v4di) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+ return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A,
+ (__v2di) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
+{
+ return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+ return (__m128i)
+ __builtin_ia32_pbroadcastq128_gpr_mask (__A,
+ (__v2di) _mm_setzero_si128 (),
+ __M);
+}
+
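+/* 128-bit lane broadcasts (VBROADCASTF32X4/VBROADCASTI32X4): the
+   whole 128-bit source is replicated into both halves of the 256-bit
+   destination before masking is applied.  */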
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_f32x4 (__m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+ (__v8sf) _mm256_undefined_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+ (__v8sf) __O,
+ __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
+{
+ return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_i32x4 (__m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+ __A,
+ (__v8si) _mm256_undefined_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+ __A,
+ (__v8si)
+ __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+ __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
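+/* Masked sign extensions (VPMOVSX*): the low elements of the source
+   are widened with sign replication.  Illustrative sketch:
+
+     __m128i b = _mm_set1_epi8 (-1);
+     __m256i r = _mm256_maskz_cvtepi8_epi32 (0xFF, b);
+     // all eight 32-bit lanes = -1.  */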
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
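+/* Masked zero extensions (VPMOVZX*): as above, but the widened
+   elements are filled with zeroes, so each result lane is the
+   unsigned value of its source element.  */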
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+{
+ return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
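+/* The _rcp14_* and _rsqrt14_* intrinsics below (VRCP14 / VRSQRT14) return
+   per-lane approximations of 1/x and 1/sqrt(x) with a maximum relative
+   error of 2^-14.  As elsewhere in this header, the _mask_ forms merge the
+   result into __W under mask __U, and the _maskz_ forms zero the lanes
+   whose mask bit is clear.  Illustrative sketch (hypothetical values):
+
+     __m256d x = _mm256_set1_pd (8.0);
+     __m256d r = _mm256_rcp14_pd (x);             // every lane ~ 0.125
+     __m256d z = _mm256_maskz_rcp14_pd (0x5, x);  // lanes 0,2 ~ 0.125; 1,3 = 0.0  */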
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp14_pd (__m256d __A)
+{
+ return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_pd (__m128d __A)
+{
+ return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp14_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt14_pd (__m256d __A)
+{
+ return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_pd (__m128d __A)
+{
+ return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt14_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
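+/* Masked integer add/subtract.  Each lane computes __A op __B; the _mask_
+   variant writes the result only where the corresponding bit of __U is set
+   and otherwise keeps the lane of __W, while the _maskz_ variant zeroes the
+   unselected lanes.  Illustrative sketch (hypothetical values):
+
+     __m256i a = _mm256_set1_epi32 (3), b = _mm256_set1_epi32 (4);
+     __m256i r = _mm256_maskz_add_epi32 (0x0F, a, b);  // lanes 0-3 = 7, lanes 4-7 = 0  */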
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
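+/* The _getexp_* intrinsics (VGETEXP) extract the unbiased exponent of each
+   lane as a floating-point value, i.e. floor(log2(|x|)); for example
+   getexp(8.0) yields 3.0 and getexp(0.5) yields -1.0.  */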
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_pd (__m256d __A)
+{
+ return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_pd (__m128d __A)
+{
+ return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
+ (__v4si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
+ (__v4si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
+ (__v2di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
+ (__v2di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
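+/* The _scalef_* intrinsics (VSCALEF) compute __A * 2^floor(__B) per lane,
+   the vector counterpart of scalbn; e.g. scalef(3.0, 2.5) yields 12.0.  */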
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256 __B)
+{
+ return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
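+/* Fused multiply-add family.  Each lane computes (__A * __B) + __C in a
+   single rounding step.  Three masking flavours are provided: _mask_ merges
+   unselected lanes from the first operand, _mask3_ merges them from the
+   third, and _maskz_ zeroes them.  */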
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_pd (__m256d __A, __mmask8 __U, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_pd (__m256d __A, __m256d __B, __m256d __C,
+ __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_pd (__m128d __A, __m128d __B, __m128d __C,
+ __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_ps (__m256 __A, __m256 __B, __m256 __C,
+ __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_pd (__m256d __A, __mmask8 __U, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmsubpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_pd (__m256d __A, __m256d __B, __m256d __C,
+ __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmsubpd256_maskz ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmsubpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_pd (__m128d __A, __m128d __B, __m128d __C,
+ __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmsubpd128_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmsubps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_ps (__m256 __A, __m256 __B, __m256 __C,
+ __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmsubps256_maskz ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmsubps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmsubps128_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
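+/* Alternating add/subtract FMA.  For _fmaddsub_, even-indexed lanes compute
+   (__A * __B) - __C and odd-indexed lanes (__A * __B) + __C; _fmsubadd_
+   swaps the two.  Note that the _mask_ and _maskz_ fmsubadd forms below
+   reuse the fmaddsub builtins with __C negated.  */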
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_pd (__m256d __A, __mmask8 __U, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C,
+ __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmaddsub_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmaddsub_pd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C,
+ __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmaddsub_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_ps (__m256 __A, __mmask8 __U, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C,
+ __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmaddsub_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmaddsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C,
+ __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmaddsub_ps (__mmask8 __U, __m128 __A, __m128 __B,
+ __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsubadd_pd (__m256d __A, __mmask8 __U, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C,
+ __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsubadd_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+ (__v4df) __B,
+ -(__v4df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsubadd_pd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C,
+ __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsubadd_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+ (__v2df) __B,
+ -(__v2df) __C,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsubadd_ps (__m256 __A, __mmask8 __U, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C,
+ __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsubadd_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+ (__v8sf) __B,
+ -(__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsubadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C,
+ __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsubadd_ps (__mmask8 __U, __m128 __A, __m128 __B,
+ __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ -(__v4sf) __C,
+ (__mmask8) __U);
+}
+
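+/* Negated FMA family: _fnmadd_ computes -(__A * __B) + __C and _fnmsub_
+   computes -(__A * __B) - __C per lane, with the same _mask_/_mask3_/_maskz_
+   conventions as above.  */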
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_pd (__m256d __A, __mmask8 __U, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C,
+ __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_vfnmaddpd256_mask3 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmadd_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfnmaddpd256_maskz ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_pd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C,
+ __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfnmaddpd128_mask3 ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmaddpd128_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_ps (__m256 __A, __mmask8 __U, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C,
+ __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_vfnmaddps256_mask3 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmadd_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfnmaddps256_maskz ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfnmaddps128_mask3 ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmaddps128_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_pd (__m256d __A, __mmask8 __U, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C,
+ __mmask8 __U)
+{
+ return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfnmsubpd256_maskz ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_pd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C,
+ __mmask8 __U)
+{
+ return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfnmsubpd128_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_ps (__m256 __A, __mmask8 __U, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C,
+ __mmask8 __U)
+{
+ return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfnmsubps256_maskz ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+ return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfnmsubps128_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __C,
+ (__mmask8) __U);
+}
+
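+/* Masked bitwise logic on 32-bit lanes.  _andnot_ computes (~__A) & __B,
+   and the unmasked _or_epi32 / _xor_epi32 forms are written directly with
+   GNU vector extensions instead of a builtin.  Illustrative sketch
+   (hypothetical values):
+
+     __m128i a = _mm_set1_epi32 (0xF0), b = _mm_set1_epi32 (0x3C);
+     __m128i r = _mm_maskz_and_epi32 (0x3, a, b);  // lanes 0,1 = 0x30; 2,3 = 0  */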
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v8su)__A | (__v8su)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v4su)__A | (__v4su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v8su)__A ^ (__v8su)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v4su)__A ^ (__v4su)__B);
+}
+
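+/* Masked conversions.  _mm_mask_cvtpd_ps narrows two doubles into the low
+   two float lanes of the result (the upper two lanes are zeroed), and the
+   _cvtps_epu32 forms (VCVTPS2UDQ) convert to unsigned 32-bit integers using
+   the current rounding mode.  */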
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A)
+{
+ return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A)
+{
+ return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A)
+{
+ return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epu32 (__m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A)
+{
+ return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epu32 (__m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A)
+{
+ return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
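+/* Masked duplication shuffles: _movedup_pd copies each even-indexed double
+   over its odd neighbour (lane pattern 0,0,2,2), _movehdup_ps duplicates
+   odd-indexed floats (1,1,3,3,...), and _moveldup_ps duplicates even-indexed
+   floats (0,0,2,2,...), each with the usual merge/zero masking.  */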
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
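+/* The unpack intrinsics interleave the high (unpackhi) or low
+ (unpacklo) halves of each 128-bit lane of __A and __B, then apply
+ the write mask as above. */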
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
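+/* The comparisons below return an __mmask8 with one bit per element.
+ The unsigned forms go through the generic UCMP builtin, whose
+ immediate operand selects the predicate; 0 is _MM_CMPINT_EQ. For
+ example, __mmask8 __m = _mm256_cmpeq_epi32_mask (__a, __b) sets bit
+ i of __m iff the i-th 32-bit elements of __a and __b are equal. */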
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu32_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+ (__v4si) __B, 0,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi32_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+ (__v4si) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A,
+ (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu32_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+ (__v8si) __B, 0,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi32_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+ (__v8si) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A,
+ (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu64_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+ (__v2di) __B, 0,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+ (__v2di) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A,
+ (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu64_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+ (__v4di) __B, 0,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi64_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+ (__v4di) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A,
+ (__v4di) __B, __U);
+}
+
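+/* For the unsigned greater-than comparisons the UCMP immediate is 6,
+ _MM_CMPINT_NLE (not-less-or-equal), which is greater-than under a
+ total order. */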
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu32_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+ (__v4si) __B, 6,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi32_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+ (__v4si) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A,
+ (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu32_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+ (__v8si) __B, 6,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi32_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+ (__v8si) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A,
+ (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu64_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+ (__v2di) __B, 6,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+ (__v2di) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A,
+ (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu64_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+ (__v4di) __B, 6,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi64_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+ (__v4di) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A,
+ (__v4di) __B, __U);
+}
+
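+/* PTESTM sets bit i of the result mask when element i of (__A & __B)
+ has any bit set, a per-element analogue of the TEST instruction. */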
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi32_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+ (__v4si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+ (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi32_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+ (__v8si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+ (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi64_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+ (__v2di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+ (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi64_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+ (__v4di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+ (__v4di) __B, __U);
+}
+
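+/* PTESTNM is the complement: bit i is set when element i of
+ (__A & __B) is all zeros. */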
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi32_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+ (__v4si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+ (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+ (__v8si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+ (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi64_mask (__m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+ (__v2di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+ (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+ (__v4di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+ (__v4di) __B, __U);
+}
+
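+/* Compress packs the elements selected by __U contiguously into the
+ low part of the destination; compressstoreu writes exactly the
+ selected elements, contiguously and unaligned, to memory at __P. */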
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+ __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+ (__v4df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+ __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+ (__v2df) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+ __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+ (__v8sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+ __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+ (__v4sf) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+ (__v4di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+ (__v2di) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+ __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+ (__v8si) __A,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+ __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+ (__v4si) __A,
+ (__mmask8) __U);
+}
+
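+/* Expand is the inverse of compress: consecutive source elements (or
+ consecutive elements loaded from __P, for expandloadu) are scattered
+ into the destination lanes selected by __U. For example,
+ _mm256_maskz_expandloadu_pd (0x5, __p) loads two doubles from __p
+ into lanes 0 and 2 and zeroes lanes 1 and 3. */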
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A)
+{
+ return (__m256d) __builtin_ia32_expanddf256_maskz ((__v4df) __A,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+ (__v4df) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m256d) __builtin_ia32_expandloaddf256_maskz ((__v4df *) __P,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A)
+{
+ return (__m128d) __builtin_ia32_expanddf128_maskz ((__v2df) __A,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+ (__v2df) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
+{
+ return (__m128d) __builtin_ia32_expandloaddf128_maskz ((__v2df *) __P,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A)
+{
+ return (__m256) __builtin_ia32_expandsf256_maskz ((__v8sf) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P)
+{
+ return (__m256) __builtin_ia32_expandloadsf256_maskz ((__v8sf *) __P,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A)
+{
+ return (__m128) __builtin_ia32_expandsf128_maskz ((__v4sf) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P)
+{
+ return (__m128) __builtin_ia32_expandloadsf128_maskz ((__v4sf *) __P,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_expanddi256_maskz ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+ void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+ (__v4di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloaddi256_maskz ((__v4di *) __P,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_expanddi128_maskz ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+ (__v2di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloaddi128_maskz ((__v2di *) __P,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_expandsi256_maskz ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+ void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+ (__v8si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m256i) __builtin_ia32_expandloadsi256_maskz ((__v8si *) __P,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_expandsi128_maskz ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+ (__v4si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P)
+{
+ return (__m128i) __builtin_ia32_expandloadsi128_maskz ((__v4si *) __P,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
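+/* The two-source permutes select each destination element from the
+ concatenation of __A and __B according to the index vector __I. The
+ _mask form leaves masked-off lanes equal to __A (VPERMT2*), the
+ _mask2 form leaves them equal to __I reinterpreted as the result
+ type (VPERMI2*), and the _maskz form zeroes them. */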
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+ /* idx */ ,
+ (__v4df) __A,
+ (__v4df) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+ /* idx */ ,
+ (__v4df) __A,
+ (__v4df) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
+ (__v4di) __I
+ /* idx */ ,
+ (__v4df) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
+ /* idx */ ,
+ (__v4df) __A,
+ (__v4df) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B)
+{
+ return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+ /* idx */ ,
+ (__v8sf) __A,
+ (__v8sf) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
+ __m256 __B)
+{
+ return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+ /* idx */ ,
+ (__v8sf) __A,
+ (__v8sf) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
+ __m256 __B)
+{
+ return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
+ (__v8si) __I
+ /* idx */ ,
+ (__v8sf) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
+ __m256 __B)
+{
+ return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
+ /* idx */ ,
+ (__v8sf) __A,
+ (__v8sf) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+ /* idx */ ,
+ (__v2di) __A,
+ (__v2di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+ /* idx */ ,
+ (__v2di) __A,
+ (__v2di) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
+ (__v2di) __I
+ /* idx */ ,
+ (__v2di) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
+ /* idx */ ,
+ (__v2di) __A,
+ (__v2di) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+ /* idx */ ,
+ (__v4si) __A,
+ (__v4si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+ /* idx */ ,
+ (__v4si) __A,
+ (__v4si) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
+ (__v4si) __I
+ /* idx */ ,
+ (__v4si) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
+ /* idx */ ,
+ (__v4si) __A,
+ (__v4si) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+ /* idx */ ,
+ (__v4di) __A,
+ (__v4di) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+ /* idx */ ,
+ (__v4di) __A,
+ (__v4di) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
+ __mmask8 __U, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
+ (__v4di) __I
+ /* idx */ ,
+ (__v4di) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
+ __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
+ /* idx */ ,
+ (__v4di) __A,
+ (__v4di) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+ /* idx */ ,
+ (__v8si) __A,
+ (__v8si) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+ /* idx */ ,
+ (__v8si) __A,
+ (__v8si) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
+ __mmask8 __U, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
+ (__v8si) __I
+ /* idx */ ,
+ (__v8si) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
+ __m256i __I, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
+ /* idx */ ,
+ (__v8si) __A,
+ (__v8si) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+ /* idx */ ,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+ /* idx */ ,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
+ (__v2di) __I
+ /* idx */ ,
+ (__v2df) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
+ /* idx */ ,
+ (__v2df) __A,
+ (__v2df) __B,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B)
+{
+ return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+ /* idx */ ,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
+ __m128 __B)
+{
+ return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+ /* idx */ ,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
+ __m128 __B)
+{
+ return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
+ (__v4si) __I
+ /* idx */ ,
+ (__v4sf) __B,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
+ __m128 __B)
+{
+ return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
+ /* idx */ ,
+ (__v4sf) __A,
+ (__v4sf) __B,
+ (__mmask8)
+ __U);
+}
+
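+/* Variable per-element shifts: SLLV/SRLV shift logically and yield
+ zero for counts >= the element width, while SRAV shifts right
+ arithmetically and fills with the sign bit. AVX512VL adds the
+ 64-bit arithmetic form, which AVX2 lacks. */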
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
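+/* Variable rotates, new with AVX-512: each element of __A is rotated
+ left (PROLV) or right (PRORV) by the corresponding element of __B,
+ taken modulo the element width. */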
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rolv_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rolv_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rorv_epi32 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rorv_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rolv_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rolv_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rorv_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rorv_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
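+/* Variable-count arithmetic right shifts on 64-bit lanes.  AVX2 only
+   provides the 32-bit VPSRAVD; the 64-bit VPSRAVQ form is new with
+   AVX-512, so even the unmasked variant is routed through the masked
+   builtin with an all-ones mask.  */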
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi64 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
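+/* Masked bitwise AND/ANDNOT/OR/XOR on 64-bit lanes.  The mask only
+   selects which destination elements are written; the unmasked or/xor
+   forms below compile to plain GNU vector operators.  */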
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A | (__v4du)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A | (__v2du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) ((__v4du)__A ^ (__v4du)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A ^ (__v2du)__B);
+}
+
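+/* Write-masked forms of the packed floating-point max/min/div/mul
+   operations.  __W supplies the pass-through value for each element
+   whose mask bit is clear; the maskz variants zero it instead.  */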
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
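+/* 64-bit integer min/max (VPMAXSQ/VPMINSQ/VPMAXUQ/VPMINUQ) first
+   appear with AVX-512, so even the unmasked _mm*_min/max_epi64 and
+   _epu64 intrinsics are defined here rather than in the SSE/AVX
+   headers.  */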
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu64 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W, __M);
+}
+
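+/* The next block needs AVX512CD in addition to AVX512VL.  If AVX512CD
+   is not already enabled, switch it on locally with push_options and
+   pop it again after the block.  */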
+#ifndef __AVX512CD__
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512cd")
+#define __DISABLE_AVX512VLCD__
+#endif
+
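+/* broadcastm replicates the mask register itself: the 8- or 16-bit
+   mask value is zero-extended and broadcast into every 64- or 32-bit
+   element of the destination.  */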
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+ return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+ return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+ return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+ return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+}
+
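+/* lzcnt counts the leading zero bits of each element.  conflict
+   (VPCONFLICTD/VPCONFLICTQ) compares each element against all
+   preceding elements and records the matches as a bit set, the
+   AVX512CD building block for detecting duplicate indices in
+   gather/scatter loops.  */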
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conflict_epi64 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conflict_epi32 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lzcnt_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lzcnt_epi64 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conflict_epi64 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conflict_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+#ifdef __DISABLE_AVX512VLCD__
+#pragma GCC pop_options
+#endif
+
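+/* Masked unpack (interleave) of packed double and single elements,
+   followed by masked half-precision to single-precision
+   conversions.  */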
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256 __B)
+{
+ return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B)
+{
+ return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B)
+{
+ return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256 __B)
+{
+ return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
+{
+ return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+ return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
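+/* Shift-by-count forms: the count is taken from the low 64 bits of
+   the XMM operand __B and applied to every element.  The 64-bit
+   arithmetic right shift VPSRAQ is new with AVX-512.  */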
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+ (__v4si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+ (__v4si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi64 (__m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+ (__v2di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+ (__v2di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+ (__v2di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+ (__v4si) __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+ (__v4si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+ (__v2di) __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+ return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+ (__v2di) __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
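+/* Full-width permutes.  permutexvar (VPERMPS/VPERMPD) indexes across
+   the whole 256-bit vector, crossing 128-bit lanes, while permutevar
+   keeps the AVX VPERMILPS/VPERMILPD behaviour of selecting only
+   within each 128-bit lane.  */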
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
+ __m256 __Y)
+{
+ return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+ (__v8si) __X,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
+{
+ return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+ (__v8si) __X,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
+{
+ return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+ (__v4di) __X,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
+ __m256d __Y)
+{
+ return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+ (__v4di) __X,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
+{
+ return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+ (__v4di) __X,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256i __C)
+{
+ return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+ (__v4di) __C,
+ (__v4df) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
+{
+ return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+ (__v4di) __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256i __C)
+{
+ return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+ (__v8si) __C,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
+{
+ return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+ (__v8si) __C,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128i __C)
+{
+ return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+ (__v2di) __C,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+{
+ return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+ (__v2di) __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
+ __m128i __C)
+{
+ return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+ (__v4si) __C,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
+{
+ return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+ (__v4si) __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
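+/* Masked 32-bit multiplies.  mullo keeps the low 32 bits of each
+   product, while mul_epi32/mul_epu32 are the widening forms that
+   multiply the even-numbered 32-bit elements into full 64-bit
+   products.  */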
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+ (__v4di) __X,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+ __m256i __B)
+{
+ return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+ __m128i __B)
+{
+ return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+ (__v4di) __X,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+ (__v4di) __X,
+ (__v4di) __W,
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+ (__v8si) __X,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+ (__v8si) __Y,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
+ __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+ (__v4si) __Y,
+ (__v2di)
+ _mm_setzero_si128 (),
+ __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+ (__v8si) __X,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+ __m256i __Y)
+{
+ return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+ (__v8si) __X,
+ (__v8si) __W,
+ __M);
+}
+
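+/* In the comparison intrinsics below, the third builtin argument is
+   the comparison predicate in the _MM_CMPINT encoding: 0 = EQ,
+   1 = LT, 2 = LE, 4 = NE, 5 = NLT (GE), 6 = NLE (GT).  The unmasked
+   forms simply pass an all-ones mask of (__mmask8) -1.  */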
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi32_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, 2,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 4,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 4,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 1,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 1,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 5,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 5,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 2,
+ (__mmask8) __M);
+}
+
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi64_mask (__m128i __X, __m128i __Y)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, 2,
+ (__mmask8) -1);
+}
+
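+/* The intrinsics below take immediate operands that must be
+   compile-time constants, so the inline-function forms are only
+   provided when optimizing; without optimization the header supplies
+   equivalent #define macro forms in the matching #else branch.  */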
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex_epi64 (__m256i __X, const int __I)
+{
+ return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+ __I,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex_epi64 (__m256i __W, __mmask8 __M,
+ __m256i __X, const int __I)
+{
+ return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+ __I,
+ (__v4di) __W,
+ (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex_epi64 (__mmask8 __M, __m256i __X, const int __I)
+{
+ return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+ __I,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __imm)
+{
+ return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
+ (__v4df) __B, __imm,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ const int __imm)
+{
+ return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
+ (__v4df) __B, __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ __m128d __B, const int __imm)
+{
+ return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
+ (__v2df) __B, __imm,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ const int __imm)
+{
+ return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
+ (__v2df) __B, __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256 __B, const int __imm)
+{
+ return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
+ (__v8sf) __B, __imm,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ const int __imm)
+{
+ return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
+ (__v8sf) __B, __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+ const int __imm)
+{
+ return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
+ (__v4sf) __B, __imm,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_ps (__mmask8 __U, __m128 __A, __m128 __B,
+ const int __imm)
+{
+ return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
+ (__v4sf) __B, __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+ (__v4si) __B,
+ __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_inserti32x4 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m128i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+ (__v4si) __B,
+ __imm,
+ (__v8si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_inserti32x4 (__mmask8 __U, __m256i __A, __m128i __B,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+ (__v4si) __B,
+ __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf32x4 (__m256 __A, __m128 __B, const int __imm)
+{
+ return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+ (__v4sf) __B,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_insertf32x4 (__m256 __W, __mmask8 __U, __m256 __A,
+ __m128 __B, const int __imm)
+{
+ return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+ (__v4sf) __B,
+ __imm,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_insertf32x4 (__mmask8 __U, __m256 __A, __m128 __B,
+ const int __imm)
+{
+ return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+ (__v4sf) __B,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti32x4_epi32 (__m256i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+ __imm,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+ __imm,
+ (__v4si) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extracti32x4_epi32 (__mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+ __imm,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf32x4_ps (__m256 __A, const int __imm)
+{
+ return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+ __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m256 __A,
+ const int __imm)
+{
+ return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+ __imm,
+ (__v4sf) __W,
+ (__mmask8)
+ __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extractf32x4_ps (__mmask8 __U, __m256 __A,
+ const int __imm)
+{
+ return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+ __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8)
+ __U);
+}
+
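+/* The shuf_{i,f}{32x4,64x2} builtins select whole 128-bit lanes: in
+   the 256-bit forms, bit 0 of the immediate picks the lane of __A
+   for the low half of the result and bit 1 picks the lane of __B
+   for the high half.  */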
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_i64x2 (__m256i __A, __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+ (__v4di) __B,
+ __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_i64x2 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+ (__v4di) __B,
+ __imm,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_i64x2 (__mmask8 __U, __m256i __A, __m256i __B,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+ (__v4di) __B,
+ __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_i32x4 (__m256i __A, __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+ (__v8si) __B,
+ __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_i32x4 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+ (__v8si) __B,
+ __imm,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_i32x4 (__mmask8 __U, __m256i __A, __m256i __B,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+ (__v8si) __B,
+ __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_f64x2 (__m256d __A, __m256d __B, const int __imm)
+{
+ return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+ (__v4df) __B,
+ __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_f64x2 (__m256d __W, __mmask8 __U, __m256d __A,
+ __m256d __B, const int __imm)
+{
+ return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+ (__v4df) __B,
+ __imm,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_f64x2 (__mmask8 __U, __m256d __A, __m256d __B,
+ const int __imm)
+{
+ return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+ (__v4df) __B,
+ __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_f32x4 (__m256 __A, __m256 __B, const int __imm)
+{
+ return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_f32x4 (__m256 __W, __mmask8 __U, __m256 __A,
+ __m256 __B, const int __imm)
+{
+ return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ __imm,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_f32x4 (__mmask8 __U, __m256 __A, __m256 __B,
+ const int __imm)
+{
+ return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
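+/* vfixupimm, roughly: each element of __B is classified (zero, NaN,
+   infinity, ...) and the matching entry of a per-element lookup
+   table held in __C selects the fix-up result; __imm controls which
+   floating-point exceptions may be signalled.  The _maskz forms call
+   a dedicated *_maskz builtin rather than merging into a zero
+   vector.  */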
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fixupimm_pd (__m256d __A, __m256d __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4di) __C,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fixupimm_pd (__m256d __A, __mmask8 __U, __m256d __B,
+ __m256i __C, const int __imm)
+{
+ return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
+ (__v4df) __B,
+ (__v4di) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fixupimm_pd (__mmask8 __U, __m256d __A, __m256d __B,
+ __m256i __C, const int __imm)
+{
+ return (__m256d) __builtin_ia32_fixupimmpd256_maskz ((__v4df) __A,
+ (__v4df) __B,
+ (__v4di) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fixupimm_ps (__m256 __A, __m256 __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8si) __C,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fixupimm_ps (__m256 __A, __mmask8 __U, __m256 __B,
+ __m256i __C, const int __imm)
+{
+ return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8si) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fixupimm_ps (__mmask8 __U, __m256 __A, __m256 __B,
+ __m256i __C, const int __imm)
+{
+ return (__m256) __builtin_ia32_fixupimmps256_maskz ((__v8sf) __A,
+ (__v8sf) __B,
+ (__v8si) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_pd (__m128d __A, __m128d __B, __m128i __C,
+ const int __imm)
+{
+ return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_pd (__m128d __A, __mmask8 __U, __m128d __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_pd (__mmask8 __U, __m128d __A, __m128d __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128d) __builtin_ia32_fixupimmpd128_maskz ((__v2df) __A,
+ (__v2df) __B,
+ (__v2di) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_ps (__m128 __A, __m128 __B, __m128i __C, const int __imm)
+{
+ return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C,
+ __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_ps (__m128 __A, __mmask8 __U, __m128 __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B,
+ __m128i __C, const int __imm)
+{
+ return (__m128) __builtin_ia32_fixupimmps128_maskz ((__v4sf) __A,
+ (__v4sf) __B,
+ (__v4si) __C,
+ __imm,
+ (__mmask8) __U);
+}
+
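+/* Immediate logical right shifts, with merge (_mask) and zero
+   (_maskz) masking.  */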
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
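+/* vpternlog: the 8-bit immediate is a truth table.  For each bit
+   position, the bits taken from __A, __B and __C form a 3-bit index
+   (__A most significant), and the indexed bit of __imm is the
+   result; e.g. 0xE8 computes the bitwise majority of the three
+   inputs.  */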
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256i)
+ __builtin_ia32_pternlogq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __C,
+ (unsigned char) __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U,
+ __m256i __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256i)
+ __builtin_ia32_pternlogq256_mask ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A,
+ __m256i __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256i)
+ __builtin_ia32_pternlogq256_maskz ((__v4di) __A,
+ (__v4di) __B,
+ (__v4di) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256i)
+ __builtin_ia32_pternlogd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __C,
+ (unsigned char) __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U,
+ __m256i __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256i)
+ __builtin_ia32_pternlogd256_mask ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A,
+ __m256i __B, __m256i __C,
+ const int __imm)
+{
+ return (__m256i)
+ __builtin_ia32_pternlogd256_maskz ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C,
+ const int __imm)
+{
+ return (__m128i)
+ __builtin_ia32_pternlogq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __C,
+ (unsigned char) __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U,
+ __m128i __B, __m128i __C,
+ const int __imm)
+{
+ return (__m128i)
+ __builtin_ia32_pternlogq128_mask ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A,
+ __m128i __B, __m128i __C,
+ const int __imm)
+{
+ return (__m128i)
+ __builtin_ia32_pternlogq128_maskz ((__v2di) __A,
+ (__v2di) __B,
+ (__v2di) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C,
+ const int __imm)
+{
+ return (__m128i)
+ __builtin_ia32_pternlogd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __C,
+ (unsigned char) __imm,
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U,
+ __m128i __B, __m128i __C,
+ const int __imm)
+{
+ return (__m128i)
+ __builtin_ia32_pternlogd128_mask ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A,
+ __m128i __B, __m128i __C,
+ const int __imm)
+{
+ return (__m128i)
+ __builtin_ia32_pternlogd128_maskz ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __C,
+ (unsigned char) __imm,
+ (__mmask8) __U);
+}
+
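+/* vrndscale rounds each element to 2^-M fixed-point precision,
+   where M is the upper nibble of the immediate; the lower nibble
+   encodes the rounding control and exception-suppression bits.  */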
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_roundscale_ps (__m256 __A, const int __imm)
+{
+ return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_roundscale_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ const int __imm)
+{
+ return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+ __imm,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_roundscale_ps (__mmask8 __U, __m256 __A, const int __imm)
+{
+ return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+ __imm,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_roundscale_pd (__m256d __A, const int __imm)
+{
+ return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+ __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_roundscale_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ const int __imm)
+{
+ return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+ __imm,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_roundscale_pd (__mmask8 __U, __m256d __A, const int __imm)
+{
+ return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+ __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_ps (__m128 __A, const int __imm)
+{
+ return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+ __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_ps (__m128 __W, __mmask8 __U, __m128 __A,
+ const int __imm)
+{
+ return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+ __imm,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_ps (__mmask8 __U, __m128 __A, const int __imm)
+{
+ return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+ __imm,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_pd (__m128d __A, const int __imm)
+{
+ return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+ __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ const int __imm)
+{
+ return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+ __imm,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_pd (__mmask8 __U, __m128d __A, const int __imm)
+{
+ return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+ __imm,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
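+/* vgetmant extracts the normalized mantissa; the interval selector
+   (__B) and sign control (__C) are packed into a single immediate
+   as (__C << 2) | __B before reaching the builtin.  */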
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_ps (__m256 __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+ (__C << 2) | __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_ps (__m256 __W, __mmask8 __U, __m256 __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+ (__C << 2) | __B,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_ps (__mmask8 __U, __m256 __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+ (__C << 2) | __B,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ps (__m128 __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+ (__C << 2) | __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_ps (__m128 __W, __mmask8 __U, __m128 __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+ (__C << 2) | __B,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_ps (__mmask8 __U, __m128 __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+ (__C << 2) | __B,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_pd (__m256d __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+ (__C << 2) | __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_pd (__m256d __W, __mmask8 __U, __m256d __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+ (__C << 2) | __B,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_pd (__mmask8 __U, __m256d __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+ (__C << 2) | __B,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_pd (__m128d __A, _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+ (__C << 2) | __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_pd (__m128d __W, __mmask8 __U, __m128d __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+ (__C << 2) | __B,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_pd (__mmask8 __U, __m128d __A,
+ _MM_MANTISSA_NORM_ENUM __B,
+ _MM_MANTISSA_SIGN_ENUM __C)
+{
+ return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+ (__C << 2) | __B,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
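+/* Masked gathers: elements whose mask bit is clear keep the value
+   from __v1_old; __scale must be 1, 2, 4 or 8.  The builtins take
+   the (void const *) base directly, so __addr needs no cast.  */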
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_ps (__m256 __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr,
+ int __scale)
+{
+ return (__m256) __builtin_ia32_gather3siv8sf ((__v8sf) __v1_old,
+ __addr,
+ (__v8si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_ps (__m128 __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128) __builtin_ia32_gather3siv4sf ((__v4sf) __v1_old,
+ __addr,
+ (__v4si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_pd (__m256d __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m256d) __builtin_ia32_gather3siv4df ((__v4df) __v1_old,
+ __addr,
+ (__v4si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_pd (__m128d __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128d) __builtin_ia32_gather3siv2df ((__v2df) __v1_old,
+ __addr,
+ (__v4si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128) __builtin_ia32_gather3div8sf ((__v4sf) __v1_old,
+ __addr,
+ (__v4di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128) __builtin_ia32_gather3div4sf ((__v4sf) __v1_old,
+ __addr,
+ (__v2di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_pd (__m256d __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr,
+ int __scale)
+{
+ return (__m256d) __builtin_ia32_gather3div4df ((__v4df) __v1_old,
+ __addr,
+ (__v4di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_pd (__m128d __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128d) __builtin_ia32_gather3div2df ((__v2df) __v1_old,
+ __addr,
+ (__v2di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_epi32 (__m256i __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr,
+ int __scale)
+{
+ return (__m256i) __builtin_ia32_gather3siv8si ((__v8si) __v1_old,
+ __addr,
+ (__v8si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128i) __builtin_ia32_gather3siv4si ((__v4si) __v1_old,
+ __addr,
+ (__v4si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_epi64 (__m256i __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m256i) __builtin_ia32_gather3siv4di ((__v4di) __v1_old,
+ __addr,
+ (__v4si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_epi64 (__m128i __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128i) __builtin_ia32_gather3siv2di ((__v2di) __v1_old,
+ __addr,
+ (__v4si) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128i) __builtin_ia32_gather3div8si ((__v4si) __v1_old,
+ __addr,
+ (__v4di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128i) __builtin_ia32_gather3div4si ((__v4si) __v1_old,
+ __addr,
+ (__v2di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_epi64 (__m256i __v1_old, __mmask8 __mask,
+ __m256i __index, void const *__addr,
+ int __scale)
+{
+ return (__m256i) __builtin_ia32_gather3div4di ((__v4di) __v1_old,
+ __addr,
+ (__v4di) __index,
+ __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_epi64 (__m128i __v1_old, __mmask8 __mask,
+ __m128i __index, void const *__addr,
+ int __scale)
+{
+ return (__m128i) __builtin_ia32_gather3div2di ((__v2di) __v1_old,
+ __addr,
+ (__v2di) __index,
+ __mask, __scale);
+}
+
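+/* Scatters: the unmasked forms pass an all-ones (0xFF) mask to the
+   same builtins used by the _mask variants; only elements whose
+   mask bit is set are stored.  */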
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_ps (void *__addr, __m256i __index,
+ __m256 __v1, const int __scale)
+{
+ __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,
+ (__v8si) __index, (__v8sf) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
+ __m256i __index, __m256 __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,
+ (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_ps (void *__addr, __m128i __index, __m128 __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,
+ (__v4si) __index, (__v4sf) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128 __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index,
+ (__v4sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_pd (void *__addr, __m128i __index,
+ __m256d __v1, const int __scale)
+{
+ __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,
+ (__v4si) __index, (__v4df) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+ __m128i __index, __m256d __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index,
+ (__v4df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_pd (void *__addr, __m128i __index,
+ __m128d __v1, const int __scale)
+{
+ __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF,
+ (__v4si) __index, (__v2df) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128d __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,
+ (__v2df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_ps (void *__addr, __m256i __index,
+ __m128 __v1, const int __scale)
+{
+ __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF,
+ (__v4di) __index, (__v4sf) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+ __m256i __index, __m128 __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index,
+ (__v4sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_ps (void *__addr, __m128i __index, __m128 __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,
+ (__v2di) __index, (__v4sf) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128 __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,
+ (__v4sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_pd (void *__addr, __m256i __index,
+ __m256d __v1, const int __scale)
+{
+ __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,
+ (__v4di) __index, (__v4df) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+ __m256i __index, __m256d __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,
+ (__v4df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_pd (void *__addr, __m128i __index,
+ __m128d __v1, const int __scale)
+{
+ __builtin_ia32_scatterdiv2df (__addr, (__mmask8) 0xFF,
+ (__v2di) __index, (__v2df) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128d __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index,
+ (__v2df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_epi32 (void *__addr, __m256i __index,
+ __m256i __v1, const int __scale)
+{
+ __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,
+ (__v8si) __index, (__v8si) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
+ __m256i __index, __m256i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,
+ (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_epi32 (void *__addr, __m128i __index,
+ __m128i __v1, const int __scale)
+{
+ __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,
+ (__v4si) __index, (__v4si) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,
+ (__v4si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_epi64 (void *__addr, __m128i __index,
+ __m256i __v1, const int __scale)
+{
+ __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,
+ (__v4si) __index, (__v4di) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+ __m128i __index, __m256i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,
+ (__v4di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_epi64 (void *__addr, __m128i __index,
+ __m128i __v1, const int __scale)
+{
+ __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,
+ (__v4si) __index, (__v2di) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index,
+ (__v2di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_epi32 (void *__addr, __m256i __index,
+ __m128i __v1, const int __scale)
+{
+ __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF,
+ (__v4di) __index, (__v4si) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+ __m256i __index, __m128i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv8si (__addr, __mask, (__v4di) __index,
+ (__v4si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_epi32 (void *__addr, __m128i __index,
+ __m128i __v1, const int __scale)
+{
+ __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,
+ (__v2di) __index, (__v4si) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,
+ (__v4si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_epi64 (void *__addr, __m256i __index,
+ __m256i __v1, const int __scale)
+{
+ __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF,
+ (__v4di) __index, (__v4di) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+ __m256i __index, __m256i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,
+ (__v4di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_epi64 (void *__addr, __m128i __index,
+ __m128i __v1, const int __scale)
+{
+ __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF,
+ (__v2di) __index, (__v2di) __v1,
+ __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+ __m128i __index, __m128i __v1,
+ const int __scale)
+{
+ __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,
+ (__v2di) __v1, __scale);
+}
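+
+/* Illustrative usage sketch (not part of the upstream header): with a
+   write mask of 0x5 only elements 0 and 2 are stored, e.g.
+     __m128i idx = _mm_setr_epi32 (0, 2, 4, 6);
+     _mm256_mask_i32scatter_pd (buf, (__mmask8) 0x5, idx, v, 8);
+   writes lane 0 of v to buf[0] and lane 2 to buf[4], since the scale
+   argument is the byte stride per index unit.  */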
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ _MM_PERM_ENUM __mask)
+{
+ return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_epi32 (__mmask8 __U, __m256i __A,
+ _MM_PERM_ENUM __mask)
+{
+ return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ _MM_PERM_ENUM __mask)
+{
+ return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_epi32 (__mmask8 __U, __m128i __A,
+ _MM_PERM_ENUM __mask)
+{
+ return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
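+
+/* Masked forms of the pshufd shuffles: the _MM_PERM_ENUM argument
+   holds four 2-bit source indices, e.g. _MM_PERM_DCBA (0xE4) keeps
+   the element order unchanged.  */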
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rol_epi32 (__m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rol_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __B)
+{
+ return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rol_epi32 (__mmask8 __U, __m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rol_epi32 (__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rol_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __B)
+{
+ return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rol_epi32 (__mmask8 __U, __m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ror_epi32 (__m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ror_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __B)
+{
+ return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ror_epi32 (__mmask8 __U, __m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ror_epi32 (__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ror_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __B)
+{
+ return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ror_epi32 (__mmask8 __U, __m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rol_epi64 (__m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rol_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __B)
+{
+ return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rol_epi64 (__mmask8 __U, __m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rol_epi64 (__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rol_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __B)
+{
+ return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rol_epi64 (__mmask8 __U, __m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ror_epi64 (__m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ror_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __B)
+{
+ return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ror_epi64 (__mmask8 __U, __m256i __A, const int __B)
+{
+ return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ror_epi64 (__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ror_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __B)
+{
+ return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ror_epi64 (__mmask8 __U, __m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
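+
+/* The rotates keep every bit, so for 0 < n < element width
+   _mm256_rol_epi32 (x, n) == _mm256_ror_epi32 (x, 32 - n), and
+   likewise for the 64-bit forms with 64 - n.  */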
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi32 (__m128i __A, __m128i __B, const int __imm)
+{
+ return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+ (__v4si) __B, __imm,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B, const int __imm)
+{
+ return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+ (__v4si) __B, __imm,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi32 (__mmask8 __U, __m128i __A, __m128i __B,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+ (__v4si) __B, __imm,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi64 (__m128i __A, __m128i __B, const int __imm)
+{
+ return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+ (__v2di) __B, __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ __m128i __B, const int __imm)
+{
+ return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+ (__v2di) __B, __imm,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi64 (__mmask8 __U, __m128i __A, __m128i __B,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+ (__v2di) __B, __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi32 (__m256i __A, __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+ (__v8si) __B, __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+ (__v8si) __B, __imm,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi32 (__mmask8 __U, __m256i __A, __m256i __B,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+ (__v8si) __B, __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi64 (__m256i __A, __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+ (__v4di) __B, __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ __m256i __B, const int __imm)
+{
+ return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+ (__v4di) __B, __imm,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi64 (__mmask8 __U, __m256i __A, __m256i __B,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+ (__v4di) __B, __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
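+
+/* valignd/valignq concatenate the two sources and extract a window of
+   whole elements starting at __imm; unlike the SSSE3 _mm_alignr_epi8,
+   the shift count is in 32/64-bit elements, not bytes.  */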
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A,
+ const int __I)
+{
+ return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A, const int __I)
+{
+ return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A,
+ const int __I)
+{
+ return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
+ (__v8hi) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I)
+{
+ return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
+ (__v8hi)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
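+
+/* Masked float -> half conversions; __I is the rounding-control
+   immediate, commonly _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC
+   or _MM_FROUND_CUR_DIRECTION.  */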
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi64 (__m256i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ const int __imm)
+{
+ return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+{
+ return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi64 (__m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+ const int __imm)
+{
+ return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+{
+ return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
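+
+/* Note that a 64-bit arithmetic right shift (VPSRAQ) is new with
+   AVX-512: the epi32 forms above only add masking to an existing
+   operation, while _mm_srai_epi64/_mm256_srai_epi64 have no
+   pre-AVX-512 equivalent.  */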
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B)
+{
+ return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+ int __B)
+{
+ return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+ int __B)
+{
+ return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B)
+{
+ return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex_pd (__m256d __W, __mmask8 __U, __m256d __X,
+ const int __imm)
+{
+ return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex_pd (__mmask8 __U, __m256d __X, const int __imm)
+{
+ return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_pd (__m256d __W, __mmask8 __U, __m256d __X,
+ const int __C)
+{
+ return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_pd (__mmask8 __U, __m256d __X, const int __C)
+{
+ return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
+ (__v4df)
+ _mm256_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_pd (__m128d __W, __mmask8 __U, __m128d __X,
+ const int __C)
+{
+ return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_pd (__mmask8 __U, __m128d __X, const int __C)
+{
+ return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
+ (__v2df)
+ _mm_setzero_pd (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_ps (__m256 __W, __mmask8 __U, __m256 __X,
+ const int __C)
+{
+ return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_ps (__mmask8 __U, __m256 __X, const int __C)
+{
+ return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
+ (__v8sf)
+ _mm256_setzero_ps (),
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_ps (__m128 __W, __mmask8 __U, __m128 __X,
+ const int __C)
+{
+ return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C)
+{
+ return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
+ (__v4sf)
+ _mm_setzero_ps (),
+ (__mmask8) __U);
+}
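+
+/* Masked VPERMILPD/VPERMILPS variants: the immediate selects sources
+   within each 128-bit lane (1 control bit per double, 2 per float),
+   whereas the permutex_pd forms above may cross the two lanes.  */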
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W)
+{
+ return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+ (__v4df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W)
+{
+ return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+ (__v8sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+ return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W)
+{
+ return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+ (__v2df) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W)
+{
+ return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+ (__v4sf) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+ return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask8) __U);
+}
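+
+/* Mask blends: element i of the result comes from __W where bit i of
+   __U is set and from __A where it is clear, e.g.
+     _mm256_mask_blend_ps (0xF0, a, w)
+   takes the low four floats from a and the high four from w.  */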
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi32_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu64_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu32_mask (__m256i __X, __m256i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pd_mask (__m256d __X, __m256d __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
+ (__v4df) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ps_mask (__m256 __X, __m256 __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
+ (__v8sf) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+ (__v4di) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+ (__v8si) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_pd_mask (__mmask8 __U, __m256d __X, __m256d __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
+ (__v4df) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_ps_mask (__mmask8 __U, __m256 __X, __m256 __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
+ (__v8sf) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi64_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi32_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu64_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu32_mask (__m128i __X, __m128i __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pd_mask (__m128d __X, __m128d __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ps_mask (__m128 __X, __m128 __Y, const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+ (__v2di) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+ (__v4si) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_pd_mask (__mmask8 __U, __m128d __X, __m128d __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
+ (__v2df) __Y, __P,
+ (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ps_mask (__mmask8 __U, __m128 __X, __m128 __Y,
+ const int __P)
+{
+ return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
+ (__v4sf) __Y, __P,
+ (__mmask8) __U);
+}
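+
+/* __P selects the comparison predicate: _MM_CMPINT_* (e.g.
+   _MM_CMPINT_LT) for the integer forms, the _CMP_* constants (e.g.
+   _CMP_LT_OS) for pd/ps.  A minimal sketch:
+     __mmask8 m = _mm256_cmp_epi32_mask (x, y, _MM_CMPINT_LT);
+   sets bit i of m when x[i] < y[i] (signed).  */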
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex_pd (__m256d __X, const int __M)
+{
+ return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __M,
+ (__v4df)
+ _mm256_undefined_pd (),
+ (__mmask8) -1);
+}
+
+#else
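+/* Without optimization the always_inline wrappers above cannot be
+   relied on to fold their immediate arguments into constants, so the
+   same intrinsics are provided as macros that pass the immediates
+   straight through to the builtins.  */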
+#define _mm256_permutex_pd(X, M) \
+ ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(X), (int)(M), \
+ (__v4df)(__m256d) \
+ _mm256_undefined_pd (), \
+ (__mmask8)-1))
+
+#define _mm256_permutex_epi64(X, I) \
+ ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
+ (int)(I), \
+ (__v4di)(__m256i) \
+ (_mm256_setzero_si256 ()),\
+ (__mmask8) -1))
+
+#define _mm256_maskz_permutex_epi64(M, X, I) \
+ ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
+ (int)(I), \
+ (__v4di)(__m256i) \
+ (_mm256_setzero_si256 ()),\
+ (__mmask8)(M)))
+
+#define _mm256_mask_permutex_epi64(W, M, X, I) \
+ ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
+ (int)(I), \
+ (__v4di)(__m256i)(W), \
+ (__mmask8)(M)))
+
+#define _mm256_insertf32x4(X, Y, C) \
+ ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \
+ (__v4sf)(__m128) (Y), (int) (C), \
+ (__v8sf)(__m256)_mm256_setzero_ps (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_insertf32x4(W, U, X, Y, C) \
+ ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \
+ (__v4sf)(__m128) (Y), (int) (C), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_insertf32x4(U, X, Y, C) \
+ ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \
+ (__v4sf)(__m128) (Y), (int) (C), \
+ (__v8sf)(__m256)_mm256_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm256_inserti32x4(X, Y, C) \
+ ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\
+ (__v4si)(__m128i) (Y), (int) (C), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_inserti32x4(W, U, X, Y, C) \
+ ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\
+ (__v4si)(__m128i) (Y), (int) (C), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_inserti32x4(U, X, Y, C) \
+ ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\
+ (__v4si)(__m128i) (Y), (int) (C), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm256_extractf32x4_ps(X, C) \
+ ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
+ (int) (C), \
+ (__v4sf)(__m128)_mm_setzero_ps (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_extractf32x4_ps(W, U, X, C) \
+ ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
+ (int) (C), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_extractf32x4_ps(U, X, C) \
+ ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
+ (int) (C), \
+ (__v4sf)(__m128)_mm_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm256_extracti32x4_epi32(X, C) \
+ ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\
+ (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm256_mask_extracti32x4_epi32(W, U, X, C) \
+ ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\
+ (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_extracti32x4_epi32(U, X, C) \
+ ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\
+ (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
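+
+/* The insert/extract 32x4 macros move one 128-bit half of a 256-bit
+   vector; with only two halves, just bit 0 of C is significant.  */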
+
+#define _mm256_shuffle_i64x2(X, Y, C) \
+ ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(C), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_shuffle_i64x2(W, U, X, Y, C) \
+ ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(C), \
+ (__v4di)(__m256i)(W),\
+ (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_i64x2(U, X, Y, C) \
+ ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(C), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm256_shuffle_i32x4(X, Y, C) \
+ ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(C), \
+ (__v8si)(__m256i) \
+ _mm256_setzero_si256 (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_shuffle_i32x4(W, U, X, Y, C) \
+ ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(C), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_i32x4(U, X, Y, C) \
+ ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(C), \
+ (__v8si)(__m256i) \
+ _mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm256_shuffle_f64x2(X, Y, C) \
+ ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), (int)(C), \
+ (__v4df)(__m256d)_mm256_setzero_pd (),\
+ (__mmask8)-1))
+
+#define _mm256_mask_shuffle_f64x2(W, U, X, Y, C) \
+ ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), (int)(C), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_f64x2(U, X, Y, C) \
+ ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), (int)(C), \
+						 (__v4df)(__m256d)_mm256_setzero_pd (),\
+ (__mmask8)(U)))
+
+#define _mm256_shuffle_f32x4(X, Y, C) \
+ ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(C), \
+ (__v8sf)(__m256)_mm256_setzero_ps (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_shuffle_f32x4(W, U, X, Y, C) \
+ ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(C), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_f32x4(U, X, Y, C) \
+ ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(C), \
+ (__v8sf)(__m256)_mm256_setzero_ps (), \
+ (__mmask8)(U)))
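+
+/* The shuffle_{i,f}{64x2,32x4} macros pick whole 128-bit blocks: in
+   the 256-bit forms, imm bit 0 selects which block of X supplies the
+   low half and bit 1 which block of Y supplies the high half.  */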
+
+#define _mm256_mask_shuffle_pd(W, U, A, B, C) \
+ ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_pd(U, A, B, C) \
+ ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(C), \
+ (__v4df)(__m256d) \
+ _mm256_setzero_pd (), \
+ (__mmask8)(U)))
+
+#define _mm_mask_shuffle_pd(W, U, A, B, C) \
+ ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_shuffle_pd(U, A, B, C) \
+ ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(C), \
+ (__v2df)(__m128d)_mm_setzero_pd (), \
+ (__mmask8)(U)))
+
+#define _mm256_mask_shuffle_ps(W, U, A, B, C) \
+ ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_ps(U, A, B, C) \
+ ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(C), \
+ (__v8sf)(__m256)_mm256_setzero_ps (),\
+ (__mmask8)(U)))
+
+#define _mm_mask_shuffle_ps(W, U, A, B, C) \
+ ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_shuffle_ps(U, A, B, C) \
+ ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(C), \
+ (__v4sf)(__m128)_mm_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm256_fixupimm_pd(X, Y, Z, C) \
+ ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (__v4di)(__m256i)(Z), (int)(C), \
+ (__mmask8)(-1)))
+
+#define _mm256_mask_fixupimm_pd(X, U, Y, Z, C) \
+ ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (__v4di)(__m256i)(Z), (int)(C), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_fixupimm_pd(U, X, Y, Z, C) \
+ ((__m256d)__builtin_ia32_fixupimmpd256_maskz ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (__v4di)(__m256i)(Z), (int)(C),\
+ (__mmask8)(U)))
+
+#define _mm256_fixupimm_ps(X, Y, Z, C) \
+ ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (__v8si)(__m256i)(Z), (int)(C), \
+ (__mmask8)(-1)))
+
+#define _mm256_mask_fixupimm_ps(X, U, Y, Z, C) \
+ ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (__v8si)(__m256i)(Z), (int)(C), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_fixupimm_ps(U, X, Y, Z, C) \
+ ((__m256)__builtin_ia32_fixupimmps256_maskz ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (__v8si)(__m256i)(Z), (int)(C),\
+ (__mmask8)(U)))
+
+#define _mm_fixupimm_pd(X, Y, Z, C) \
+ ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1)))
+
+#define _mm_mask_fixupimm_pd(X, U, Y, Z, C) \
+ ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__v2di)(__m128i)(Z), (int)(C), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_fixupimm_pd(U, X, Y, Z, C) \
+ ((__m128d)__builtin_ia32_fixupimmpd128_maskz ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__v2di)(__m128i)(Z), (int)(C),\
+ (__mmask8)(U)))
+
+#define _mm_fixupimm_ps(X, Y, Z, C) \
+ ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (__v4si)(__m128i)(Z), (int)(C), \
+ (__mmask8)(-1)))
+
+#define _mm_mask_fixupimm_ps(X, U, Y, Z, C) \
+ ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (__v4si)(__m128i)(Z), (int)(C),\
+ (__mmask8)(U)))
+
+#define _mm_maskz_fixupimm_ps(U, X, Y, Z, C) \
+ ((__m128)__builtin_ia32_fixupimmps128_maskz ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (__v4si)(__m128i)(Z), (int)(C),\
+ (__mmask8)(U)))
+
+#define _mm256_mask_srli_epi32(W, U, A, B) \
+ ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \
+ (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srli_epi32(U, A, B) \
+ ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \
+ (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_mask_srli_epi32(W, U, A, B) \
+ ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \
+ (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srli_epi32(U, A, B) \
+ ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \
+ (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
+
+#define _mm256_mask_srli_epi64(W, U, A, B) \
+ ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \
+ (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srli_epi64(U, A, B) \
+ ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \
+ (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_mask_srli_epi64(W, U, A, B) \
+ ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \
+ (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srli_epi64(U, A, B) \
+ ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \
+ (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
+
+#define _mm256_mask_slli_epi32(W, U, X, C) \
+ ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_slli_epi32(U, X, C) \
+ ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\
+ (__v8si)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm256_mask_slli_epi64(W, U, X, C) \
+ ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\
+ (__v4di)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_slli_epi64(U, X, C) \
+ ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\
+ (__v4di)(__m256i)_mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm_mask_slli_epi32(W, U, X, C) \
+ ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\
+ (__v4si)(__m128i)(W),\
+ (__mmask8)(U)))
+
+#define _mm_maskz_slli_epi32(U, X, C) \
+ ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\
+ (__v4si)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
+#define _mm_mask_slli_epi64(W, U, X, C) \
+ ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\
+ (__v2di)(__m128i)(W),\
+ (__mmask8)(U)))
+
+#define _mm_maskz_slli_epi64(U, X, C) \
+ ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\
+ (__v2di)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
+#define _mm256_ternarylogic_epi64(A, B, C, I) \
+ ((__m256i) \
+ __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \
+ (__v4di) (__m256i) (B), \
+ (__v4di) (__m256i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) -1))
+
+#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \
+ ((__m256i) \
+ __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \
+ (__v4di) (__m256i) (B), \
+ (__v4di) (__m256i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+
+#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \
+ ((__m256i) \
+ __builtin_ia32_pternlogq256_maskz ((__v4di) (__m256i) (A), \
+ (__v4di) (__m256i) (B), \
+ (__v4di) (__m256i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+
+#define _mm256_ternarylogic_epi32(A, B, C, I) \
+ ((__m256i) \
+ __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \
+ (__v8si) (__m256i) (B), \
+ (__v8si) (__m256i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) -1))
+
+#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \
+ ((__m256i) \
+ __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \
+ (__v8si) (__m256i) (B), \
+ (__v8si) (__m256i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+
+#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \
+ ((__m256i) \
+ __builtin_ia32_pternlogd256_maskz ((__v8si) (__m256i) (A), \
+ (__v8si) (__m256i) (B), \
+ (__v8si) (__m256i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+
+#define _mm_ternarylogic_epi64(A, B, C, I) \
+ ((__m128i) \
+ __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \
+ (__v2di) (__m128i) (B), \
+ (__v2di) (__m128i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) -1))
+
+#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \
+ ((__m128i) \
+ __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \
+ (__v2di) (__m128i) (B), \
+ (__v2di) (__m128i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+
+#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \
+ ((__m128i) \
+ __builtin_ia32_pternlogq128_maskz ((__v2di) (__m128i) (A), \
+ (__v2di) (__m128i) (B), \
+ (__v2di) (__m128i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+
+#define _mm_ternarylogic_epi32(A, B, C, I) \
+ ((__m128i) \
+ __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \
+ (__v4si) (__m128i) (B), \
+ (__v4si) (__m128i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) -1))
+
+#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \
+ ((__m128i) \
+ __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \
+ (__v4si) (__m128i) (B), \
+ (__v4si) (__m128i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
+
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \
+ ((__m128i) \
+ __builtin_ia32_pternlogd128_maskz ((__v4si) (__m128i) (A), \
+ (__v4si) (__m128i) (B), \
+ (__v4si) (__m128i) (C), \
+ (unsigned char) (I), \
+ (__mmask8) (U)))
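+
+/* vpternlog treats I as an 8-bit truth table indexed by
+   (a << 2) | (b << 1) | c for each bit position, e.g. 0x96 computes
+   A ^ B ^ C and 0xE8 the bitwise majority of A, B and C.  */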
+
+#define _mm256_roundscale_ps(A, B) \
+ ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \
+ (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1))
+
+#define _mm256_mask_roundscale_ps(W, U, A, B) \
+ ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \
+ (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_roundscale_ps(U, A, B) \
+ ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \
+ (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U)))
+
+#define _mm256_roundscale_pd(A, B) \
+ ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1))
+
+#define _mm256_mask_roundscale_pd(W, U, A, B) \
+ ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_roundscale_pd(U, A, B) \
+ ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U)))
+
+#define _mm_roundscale_ps(A, B) \
+ ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \
+ (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1))
+
+#define _mm_mask_roundscale_ps(W, U, A, B) \
+ ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \
+ (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_roundscale_ps(U, A, B) \
+ ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \
+ (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U)))
+
+#define _mm_roundscale_pd(A, B) \
+ ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \
+ (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1))
+
+#define _mm_mask_roundscale_pd(W, U, A, B) \
+ ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \
+ (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_roundscale_pd(U, A, B) \
+ ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \
+ (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U)))
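+
+/* For roundscale, imm bits 7:4 give M and the low bits the rounding
+   mode: each element is rounded to a multiple of 2^-M, so e.g.
+   imm 0x01 floors to whole numbers and 0x11 floors to halves
+   (assuming the usual VRNDSCALE immediate layout).  */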
+
+#define _mm256_getmant_ps(X, B, C) \
+ ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8sf)(__m256)_mm256_setzero_ps (), \
+ (__mmask8)-1))
+
+#define _mm256_mask_getmant_ps(W, U, X, B, C) \
+ ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8sf)(__m256)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_getmant_ps(U, X, B, C) \
+ ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v8sf)(__m256)_mm256_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm_getmant_ps(X, B, C) \
+ ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v4sf)(__m128)_mm_setzero_ps (), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_ps(W, U, X, B, C) \
+ ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v4sf)(__m128)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ps(U, X, B, C) \
+ ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v4sf)(__m128)_mm_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm256_getmant_pd(X, B, C) \
+ ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v4df)(__m256d)_mm256_setzero_pd (),\
+ (__mmask8)-1))
+
+#define _mm256_mask_getmant_pd(W, U, X, B, C) \
+ ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_getmant_pd(U, X, B, C) \
+ ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v4df)(__m256d)_mm256_setzero_pd (),\
+ (__mmask8)(U)))
+
+#define _mm_getmant_pd(X, B, C) \
+ ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v2df)(__m128d)_mm_setzero_pd (), \
+ (__mmask8)-1))
+
+#define _mm_mask_getmant_pd(W, U, X, B, C) \
+ ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v2df)(__m128d)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_getmant_pd(U, X, B, C) \
+ ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \
+ (int)(((C)<<2) | (B)), \
+ (__v2df)(__m128d)_mm_setzero_pd (), \
+ (__mmask8)(U)))
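+
+/* As the ((C) << 2) | (B) expansion shows, getmant packs the
+   normalization interval (B, an _MM_MANT_NORM_* value) into imm bits
+   1:0 and the sign control (C, an _MM_MANT_SIGN_* value) into bits
+   3:2.  */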
+
+#define _mm256_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256) __builtin_ia32_gather3siv8sf ((__v8sf)(__m256) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8si)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128) __builtin_ia32_gather3siv4sf ((__v4sf)(__m128) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4si)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256d) __builtin_ia32_gather3siv4df ((__v4df)(__m256d) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4si)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128d) __builtin_ia32_gather3siv2df ((__v2df)(__m128d) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4si)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128) __builtin_ia32_gather3div8sf ((__v4sf)(__m128) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4di)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128) __builtin_ia32_gather3div4sf ((__v4sf)(__m128) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v2di)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256d) __builtin_ia32_gather3div4df ((__v4df)(__m256d) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4di)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128d) __builtin_ia32_gather3div2df ((__v2df)(__m128d) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v2di)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256i) __builtin_ia32_gather3siv8si ((__v8si)(__m256i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v8si)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128i) __builtin_ia32_gather3siv4si ((__v4si)(__m128i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4si)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256i) __builtin_ia32_gather3siv4di ((__v4di)(__m256i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4si)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128i) __builtin_ia32_gather3siv2di ((__v2di)(__m128i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4si)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128i) __builtin_ia32_gather3div8si ((__v4si)(__m128i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4di)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128i) __builtin_ia32_gather3div4si ((__v4si)(__m128i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v2di)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm256_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m256i) __builtin_ia32_gather3div4di ((__v4di)(__m256i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v4di)(__m256i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
+#define _mm_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \
+ (__m128i) __builtin_ia32_gather3div2di ((__v2di)(__m128i) (V1OLD), \
+ (void const *) (ADDR), \
+ (__v2di)(__m128i) (INDEX), \
+ (__mmask8) (MASK), \
+ (int) (SCALE))
+
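+/* Usage sketch (illustrative comment, not part of the API): a masked
+   gather merges loaded elements into V1OLD under MASK; lanes whose mask
+   bit is clear keep their old value.  SCALE must be 1, 2, 4 or 8.
+
+     #include <immintrin.h>
+     __m256 gather_live (__m256 old, __mmask8 live,
+                         __m256i idx, float const *base)
+     {
+       // Loads base[idx[i]] for every set bit of `live` (scale 4 for
+       // sizeof(float)); the other lanes are taken from `old`.
+       return _mm256_mmask_i32gather_ps (old, live, idx, base, 4);
+     }
+*/
+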
+#define _mm256_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_ps(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm256_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_pd(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_ps(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_pd(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm256_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v8si)(__m256i) (INDEX), \
+ (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2di)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4si)(__m128i) (INDEX), \
+ (__v2di)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v4di)(__m256i) (INDEX), \
+ (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8)0xFF, \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2di)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \
+ __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8) (MASK), \
+ (__v2di)(__m128i) (INDEX), \
+ (__v2di)(__m128i) (V1), (int) (SCALE))
+
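+/* Usage sketch (illustrative comment only): the unmasked scatter forms
+   above pass an all-ones mask (0xFF), so every element is stored; the
+   masked forms store only the lanes whose mask bit is set.
+
+     #include <immintrin.h>
+     void scatter_some (double *base, __mmask8 m, __m128i idx, __m128d v)
+     {
+       // Stores v[i] to base[idx[i]] (scale 8 for sizeof(double))
+       // wherever bit i of m is set; other lanes are left untouched.
+       _mm_mask_i64scatter_pd (base, m, idx, v, 8);
+     }
+*/
+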
+#define _mm256_mask_shuffle_epi32(W, U, X, C) \
+ ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_epi32(U, X, C) \
+ ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), \
+ (__v8si)(__m256i) \
+ _mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm_mask_shuffle_epi32(W, U, X, C) \
+ ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), \
+ (__v4si)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_shuffle_epi32(U, X, C) \
+ ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), \
+ (__v4si)(__m128i)_mm_setzero_si128 (), \
+ (__mmask8)(U)))
+
+#define _mm256_rol_epi64(A, B) \
+ ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)-1))
+
+#define _mm256_mask_rol_epi64(W, U, A, B) \
+ ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \
+ (__v4di)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_rol_epi64(U, A, B) \
+ ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)(U)))
+
+#define _mm_rol_epi64(A, B) \
+ ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \
+ (__v2di)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)-1))
+
+#define _mm_mask_rol_epi64(W, U, A, B) \
+ ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \
+ (__v2di)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_rol_epi64(U, A, B) \
+ ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \
+ (__v2di)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
+#define _mm256_ror_epi64(A, B) \
+ ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)-1))
+
+#define _mm256_mask_ror_epi64(W, U, A, B) \
+ ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \
+ (__v4di)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_ror_epi64(U, A, B) \
+ ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \
+ (__v4di)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)(U)))
+
+#define _mm_ror_epi64(A, B) \
+ ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \
+ (__v2di)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)-1))
+
+#define _mm_mask_ror_epi64(W, U, A, B) \
+ ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \
+ (__v2di)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_ror_epi64(U, A, B) \
+ ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \
+ (__v2di)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
+#define _mm256_rol_epi32(A, B) \
+ ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)-1))
+
+#define _mm256_mask_rol_epi32(W, U, A, B) \
+ ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_rol_epi32(U, A, B) \
+ ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)(U)))
+
+#define _mm_rol_epi32(A, B) \
+ ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \
+ (__v4si)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)-1))
+
+#define _mm_mask_rol_epi32(W, U, A, B) \
+ ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \
+ (__v4si)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_rol_epi32(U, A, B) \
+ ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \
+ (__v4si)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
+#define _mm256_ror_epi32(A, B) \
+ ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \
+ (__v8si)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)-1))
+
+#define _mm256_mask_ror_epi32(W, U, A, B) \
+ ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \
+ (__v8si)(__m256i)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_ror_epi32(U, A, B) \
+ ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \
+ (__v8si)(__m256i) \
+ _mm256_setzero_si256 (), \
+ (__mmask8)(U)))
+
+#define _mm_ror_epi32(A, B) \
+ ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \
+ (__v4si)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)-1))
+
+#define _mm_mask_ror_epi32(W, U, A, B) \
+ ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \
+ (__v4si)(__m128i)(W), \
+ (__mmask8)(U)))
+
+#define _mm_maskz_ror_epi32(U, A, B) \
+ ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \
+ (__v4si)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
+#define _mm256_alignr_epi32(X, Y, C) \
+ ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(X), (__mmask8)-1))
+
+#define _mm256_mask_alignr_epi32(W, U, X, Y, C) \
+ ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_alignr_epi32(U, X, Y, C) \
+ ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)(U)))
+
+#define _mm256_alignr_epi64(X, Y, C) \
+ ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(X), (__mmask8)-1))
+
+#define _mm256_mask_alignr_epi64(W, U, X, Y, C) \
+ ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_alignr_epi64(U, X, Y, C) \
+ ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (),\
+ (__mmask8)(U)))
+
+#define _mm_alignr_epi32(X, Y, C) \
+ ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(X), (__mmask8)-1))
+
+#define _mm_mask_alignr_epi32(W, U, X, Y, C) \
+ ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_alignr_epi32(U, X, Y, C) \
+ ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
+#define _mm_alignr_epi64(X, Y, C) \
+ ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1))
+
+#define _mm_mask_alignr_epi64(W, U, X, Y, C) \
+ ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_alignr_epi64(U, X, Y, C) \
+ ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (),\
+ (__mmask8)(U)))
+
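+/* Usage sketch (illustrative comment only): VALIGND concatenates X
+   (upper half) with Y (lower half) and shifts the pair right by C
+   32-bit elements, keeping the low half.  Passing the same vector
+   twice therefore rotates its lanes:
+
+     #include <immintrin.h>
+     __m256i rotate_lanes_down (__m256i v)
+     {
+       // Lane i receives v[(i + 1) % 8].
+       return _mm256_alignr_epi32 (v, v, 1);
+     }
+*/
+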
+#define _mm_mask_cvtps_ph(W, U, A, I) \
+ ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), \
+ (__v8hi)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm_maskz_cvtps_ph(U, A, I) \
+ ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), \
+ (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
+#define _mm256_mask_cvtps_ph(W, U, A, I) \
+ ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), \
+ (__v8hi)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm256_maskz_cvtps_ph(U, A, I) \
+ ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), \
+ (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
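+/* Usage sketch (illustrative comment only): I is the rounding-control
+   immediate, e.g. _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC from
+   <smmintrin.h>.
+
+     #include <immintrin.h>
+     __m128i floats_to_halves (__m256 v)
+     {
+       // Packs eight binary32 values into eight binary16 values.
+       return _mm256_maskz_cvtps_ph (0xFF, v,
+                                     _MM_FROUND_TO_NEAREST_INT
+                                     | _MM_FROUND_NO_EXC);
+     }
+*/
+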
+#define _mm256_mask_srai_epi32(W, U, A, B) \
+ ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \
+ (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srai_epi32(U, A, B) \
+ ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \
+ (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_mask_srai_epi32(W, U, A, B) \
+ ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \
+ (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srai_epi32(U, A, B) \
+ ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \
+ (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
+
+#define _mm256_srai_epi64(A, B) \
+ ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
+ (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1))
+
+#define _mm256_mask_srai_epi64(W, U, A, B) \
+ ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
+ (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srai_epi64(U, A, B) \
+ ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \
+ (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_srai_epi64(A, B) \
+ ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
+ (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm_mask_srai_epi64(W, U, A, B) \
+ ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
+ (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srai_epi64(U, A, B) \
+ ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \
+ (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
+
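+/* Note (illustrative comment only): AVX-512VL supplies the 64-bit
+   arithmetic right shift that AVX2 lacks.  A minimal sketch:
+
+     #include <immintrin.h>
+     __m256i sign_broadcast (__m256i v)
+     {
+       // Each lane becomes 0 or -1 according to its sign bit.
+       return _mm256_srai_epi64 (v, 63);
+     }
+*/
+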
+#define _mm256_mask_permutex_pd(W, U, A, B) \
+ ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_permutex_pd(U, A, B) \
+ ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), \
+ (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U)))
+
+#define _mm256_mask_permute_pd(W, U, X, C) \
+ ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), \
+ (__v4df)(__m256d)(W), \
+ (__mmask8)(U)))
+
+#define _mm256_maskz_permute_pd(U, X, C) \
+ ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), \
+ (__v4df)(__m256d)_mm256_setzero_pd (),\
+ (__mmask8)(U)))
+
+#define _mm256_mask_permute_ps(W, U, X, C) \
+ ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), \
+ (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_permute_ps(U, X, C) \
+ ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), \
+ (__v8sf)(__m256)_mm256_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm_mask_permute_pd(W, U, X, C) \
+ ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), \
+ (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_permute_pd(U, X, C) \
+ ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), \
+ (__v2df)(__m128d)_mm_setzero_pd (), \
+ (__mmask8)(U)))
+
+#define _mm_mask_permute_ps(W, U, X, C) \
+ ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), \
+ (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_permute_ps(U, X, C) \
+ ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), \
+ (__v4sf)(__m128)_mm_setzero_ps (), \
+ (__mmask8)(U)))
+
+#define _mm256_mask_blend_pd(__U, __A, __W) \
+ ((__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) (__A), \
+ (__v4df) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm256_mask_blend_ps(__U, __A, __W) \
+ ((__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) (__A), \
+ (__v8sf) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm256_mask_blend_epi64(__U, __A, __W) \
+ ((__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) (__A), \
+ (__v4di) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm256_mask_blend_epi32(__U, __A, __W) \
+ ((__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) (__A), \
+ (__v8si) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm_mask_blend_pd(__U, __A, __W) \
+ ((__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) (__A), \
+ (__v2df) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm_mask_blend_ps(__U, __A, __W) \
+ ((__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) (__A), \
+ (__v4sf) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm_mask_blend_epi64(__U, __A, __W) \
+ ((__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) (__A), \
+ (__v2di) (__W), \
+ (__mmask8) (__U)))
+
+#define _mm_mask_blend_epi32(__U, __A, __W) \
+ ((__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) (__A), \
+ (__v4si) (__W), \
+ (__mmask8) (__U)))
+
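+/* Usage sketch (illustrative comment only): a mask blend selects the
+   second source where the mask bit is set and the first where it is
+   clear.
+
+     #include <immintrin.h>
+     __m256 select_ps (__mmask8 m, __m256 if_clear, __m256 if_set)
+     {
+       // Lane i is if_set[i] when bit i of m is 1, else if_clear[i].
+       return _mm256_mask_blend_ps (m, if_clear, if_set);
+     }
+*/
+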
+#define _mm256_cmp_epu32_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm256_cmp_epi64_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm256_cmp_epi32_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm256_cmp_epu64_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm256_cmp_pd_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm256_cmp_ps_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm256_mask_cmp_epi64_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm256_mask_cmp_epi32_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm256_mask_cmp_epu64_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), \
+ (__v4di)(__m256i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm256_mask_cmp_epu32_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm256_mask_cmp_pd_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm256_mask_cmp_ps_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_cmp_epi64_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm_cmp_epi32_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm_cmp_epu64_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm_cmp_epu32_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm_cmp_pd_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm_cmp_ps_mask(X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P),\
+ (__mmask8)-1))
+
+#define _mm_mask_cmp_epi64_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_mask_cmp_epi32_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_mask_cmp_epu64_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_mask_cmp_epu32_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), \
+ (__v4si)(__m128i)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_mask_cmp_pd_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P),\
+ (__mmask8)(M)))
+
+#define _mm_mask_cmp_ps_mask(M, X, Y, P) \
+ ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P),\
+ (__mmask8)(M)))
+
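+/* Usage sketch (illustrative comment only): P is one of the _CMP_*
+   predicates from <immintrin.h>; the result is a bitmask rather than
+   a vector of lane-wide booleans.
+
+     #include <immintrin.h>
+     __mmask8 lanes_below (__m256d a, __m256d b)
+     {
+       // Bit i is set when a[i] < b[i] (quiet, ordered predicate).
+       return _mm256_cmp_pd_mask (a, b, _CMP_LT_OQ);
+     }
+*/
+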
+#endif
+
+#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps ((B), (A))
+#define _mm256_mask_cvt_roundps_ph(A, B, C, D) \
+ _mm256_mask_cvtps_ph ((A), (B), (C), (D))
+#define _mm256_maskz_cvt_roundps_ph(A, B, C) \
+ _mm256_maskz_cvtps_ph ((A), (B), (C))
+#define _mm_mask_cvt_roundps_ph(A, B, C, D) \
+ _mm_mask_cvtps_ph ((A), (B), (C), (D))
+#define _mm_maskz_cvt_roundps_ph(A, B, C) _mm_maskz_cvtps_ph ((A), (B), (C))
+
+#ifdef __DISABLE_AVX512VL__
+#undef __DISABLE_AVX512VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VL__ */
+
+#endif /* _AVX512VLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VNNIINTRIN_H_INCLUDED
+#define __AVX512VNNIINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VNNI__)
+#pragma GCC push_options
+#pragma GCC target("avx512vnni")
+#define __DISABLE_AVX512VNNI__
+#endif /* __AVX512VNNI__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbusd_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpdpbusd_v16si ((__v16si)__A, (__v16si) __B,
+ (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbusd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask ((__v16si)__A,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbusd_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+ __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz ((__v16si)__B,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbusds_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpdpbusds_v16si ((__v16si)__A, (__v16si) __B,
+ (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbusds_epi32 (__m512i __A, __mmask16 __B, __m512i __C,
+ __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask ((__v16si)__A,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbusds_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+ __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz ((__v16si)__B,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpdpwssd_v16si ((__v16si)__A, (__v16si) __B,
+ (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpwssd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask ((__v16si)__A,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpwssd_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+ __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz ((__v16si)__B,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vpdpwssds_v16si ((__v16si)__A, (__v16si) __B,
+ (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpwssds_epi32 (__m512i __A, __mmask16 __B, __m512i __C,
+ __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask ((__v16si)__A,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpwssds_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+ __m512i __D)
+{
+ return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz ((__v16si)__B,
+ (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
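+/* Usage sketch (illustrative comment only): VPDPBUSD multiplies each
+   unsigned byte of the second operand with the corresponding signed
+   byte of the third and accumulates every group of four products into
+   the matching 32-bit lane of the first -- the core of int8
+   dot-product kernels.
+
+     #include <immintrin.h>
+     __m512i dot_accumulate (__m512i acc, __m512i u8, __m512i s8)
+     {
+       // acc[i] += u8[4i]*s8[4i] + ... + u8[4i+3]*s8[4i+3], i = 0..15.
+       return _mm512_dpbusd_epi32 (acc, u8, s8);
+     }
+*/
+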
+#ifdef __DISABLE_AVX512VNNI__
+#undef __DISABLE_AVX512VNNI__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VNNI__ */
+
+#endif /* __AVX512VNNIINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vnnivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VNNIVLINTRIN_H_INCLUDED
+#define _AVX512VNNIVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__)
+#pragma GCC push_options
+#pragma GCC target("avx512vnni,avx512vl")
+#define __DISABLE_AVX512VNNIVL__
+#endif /* __AVX512VNNIVL__ */
+
+#define _mm256_dpbusd_epi32(A, B, C) \
+ ((__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) (A), \
+ (__v8si) (B), \
+ (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbusd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask ((__v8si)__A, (__v8si) __C,
+ (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbusd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz ((__v8si)__B,
+ (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpbusd_epi32(A, B, C) \
+ ((__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) (A), \
+ (__v4si) (B), \
+ (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbusd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask ((__v4si)__A, (__v4si) __C,
+ (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbusd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz ((__v4si)__B,
+ (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+
+#define _mm256_dpbusds_epi32(A, B, C) \
+ ((__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) (A), \
+ (__v8si) (B), \
+ (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbusds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask ((__v8si)__A,
+ (__v8si) __C, (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbusds_epi32 (__mmask8 __A, __m256i __B, __m256i __C,
+ __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz ((__v8si)__B,
+ (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpbusds_epi32(A, B, C) \
+ ((__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) (A), \
+ (__v4si) (B), \
+ (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbusds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask ((__v4si)__A,
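+/* Usage sketch (illustrative comment only): the getmant immediate packs
+   the sign control C into bits 3:2 and the normalization-interval
+   selector B into bits 1:0, i.e. (C<<2)|B; the _MM_MANT_* enumerators
+   from <immintrin.h> supply the usual values.
+
+     #include <immintrin.h>
+     __m128d normalized_mantissas (__m128d x)
+     {
+       // Extracts each mantissa into [1,2), keeping the source sign.
+       return _mm_getmant_pd (x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+     }
+*/
+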
+ (__v4si) __C, (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbusds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz ((__v4si)__B,
+ (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+
+#define _mm256_dpwssd_epi32(A, B, C) \
+ ((__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) (A), \
+ (__v8si) (B), \
+ (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpwssd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask ((__v8si)__A, (__v8si) __C,
+ (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpwssd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz ((__v8si)__B,
+ (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpwssd_epi32(A, B, C) \
+ ((__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) (A), \
+ (__v4si) (B), \
+ (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpwssd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask ((__v4si)__A, (__v4si) __C,
+ (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpwssd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz ((__v4si)__B,
+ (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+
+#define _mm256_dpwssds_epi32(A, B, C) \
+ ((__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si) (A), \
+ (__v8si) (B), \
+ (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpwssds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask ((__v8si)__A,
+ (__v8si) __C, (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpwssds_epi32 (__mmask8 __A, __m256i __B, __m256i __C,
+ __m256i __D)
+{
+ return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz ((__v8si)__B,
+ (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpwssds_epi32(A, B, C) \
+ ((__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) (A), \
+ (__v4si) (B), \
+ (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpwssds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask ((__v4si)__A,
+ (__v4si) __C, (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpwssds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+ return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz ((__v4si)__B,
+ (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+
+#ifdef __DISABLE_AVX512VNNIVL__
+#undef __DISABLE_AVX512VNNIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VNNIVL__ */
+
+#endif /* _AVX512VNNIVLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VP2INTERSECTINTRIN_H_INCLUDED
+#define _AVX512VP2INTERSECTINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VP2INTERSECT__)
+#pragma GCC push_options
+#pragma GCC target("avx512vp2intersect")
+#define __DISABLE_AVX512VP2INTERSECT__
+#endif /* __AVX512VP2INTERSECT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_2intersect_epi32 (__m512i __A, __m512i __B, __mmask16 *__U,
+ __mmask16 *__M)
+{
+ __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_2intersect_epi64 (__m512i __A, __m512i __B, __mmask8 *__U,
+ __mmask8 *__M)
+{
+ __builtin_ia32_2intersectq512 (__U, __M, (__v8di) __A, (__v8di) __B);
+}
+
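+/* Usage sketch (illustrative comment only): VP2INTERSECT computes two
+   masks at once -- bit i of *__U is set when __A[i] occurs anywhere in
+   __B, and bit j of *__M is set when __B[j] occurs anywhere in __A.
+
+     #include <immintrin.h>
+     void intersect32 (__m512i a, __m512i b,
+                       __mmask16 *in_a, __mmask16 *in_b)
+     {
+       // in_a marks lanes of a found in b; in_b marks lanes of b in a.
+       _mm512_2intersect_epi32 (a, b, in_a, in_b);
+     }
+*/
+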
+#ifdef __DISABLE_AVX512VP2INTERSECT__
+#undef __DISABLE_AVX512VP2INTERSECT__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VP2INTERSECT__ */
+
+#endif /* _AVX512VP2INTERSECTINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
+#define _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VP2INTERSECT__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512vp2intersect,avx512vl")
+#define __DISABLE_AVX512VP2INTERSECTVL__
+#endif /* __AVX512VP2INTERSECTVL__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_2intersect_epi32 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M)
+{
+ __builtin_ia32_2intersectd128 (__U, __M, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_2intersect_epi32 (__m256i __A, __m256i __B, __mmask8 *__U,
+ __mmask8 *__M)
+{
+ __builtin_ia32_2intersectd256 (__U, __M, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_2intersect_epi64 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M)
+{
+ __builtin_ia32_2intersectq128 (__U, __M, (__v2di) __A, (__v2di) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_2intersect_epi64 (__m256i __A, __m256i __B, __mmask8 *__U,
+ __mmask8 *__M)
+{
+ __builtin_ia32_2intersectq256 (__U, __M, (__v4di) __A, (__v4di) __B);
+}
+
+#ifdef __DISABLE_AVX512VP2INTERSECTVL__
+#undef __DISABLE_AVX512VP2INTERSECTVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VP2INTERSECTVL__ */
+
+#endif /* _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512vpopcntdqintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED
+#define _AVX512VPOPCNTDQINTRIN_H_INCLUDED
+
+#ifndef __AVX512VPOPCNTDQ__
+#pragma GCC push_options
+#pragma GCC target("avx512vpopcntdq")
+#define __DISABLE_AVX512VPOPCNTDQ__
+#endif /* __AVX512VPOPCNTDQ__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi32 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountd_v16si ((__v16si) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A,
+ (__v16si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A,
+ (__v16si)
+ _mm512_setzero_si512 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi64 (__m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountq_v8di ((__v8di) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A,
+ (__v8di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+ return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A,
+ (__v8di)
+ _mm512_setzero_si512 (),
+ (__mmask8) __U);
+}
+
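+/* Usage sketch (illustrative comment only): each element's set bits
+   are counted independently; the mask/maskz forms merge into __W or
+   zero the unselected lanes.
+
+     #include <immintrin.h>
+     __m512i lane_popcounts (__m512i v)
+     {
+       // Per-lane population count of sixteen 32-bit elements.
+       return _mm512_popcnt_epi32 (v);
+     }
+*/
+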
+#ifdef __DISABLE_AVX512VPOPCNTDQ__
+#undef __DISABLE_AVX512VPOPCNTDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VPOPCNTDQ__ */
+
+#endif /* _AVX512VPOPCNTDQINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
+#define _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512vpopcntdq,avx512vl")
+#define __DISABLE_AVX512VPOPCNTDQVL__
+#endif /* __AVX512VPOPCNTDQVL__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi32 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountd_v4si ((__v4si) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi32 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A,
+ (__v4si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi32 (__mmask16 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A,
+ (__v4si)
+ _mm_setzero_si128 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi32 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountd_v8si ((__v8si) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi32 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A,
+ (__v8si) __W,
+ (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi32 (__mmask16 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A,
+ (__v8si)
+ _mm256_setzero_si256 (),
+ (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi64 (__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountq_v2di ((__v2di) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A,
+ (__v2di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+ return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A,
+ (__v2di)
+ _mm_setzero_si128 (),
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi64 (__m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountq_v4di ((__v4di) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A,
+ (__v4di) __W,
+ (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+ return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A,
+ (__v4di)
+ _mm256_setzero_si256 (),
+ (__mmask8) __U);
+}
+
+#ifdef __DISABLE_AVX512VPOPCNTDQVL__
+#undef __DISABLE_AVX512VPOPCNTDQVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VPOPCNTDQVL__ */
+
+#endif /* _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXIFMAINTRIN_H_INCLUDED
+#define _AVXIFMAINTRIN_H_INCLUDED
+
+#ifndef __AVXIFMA__
+#pragma GCC push_options
+#pragma GCC target("avxifma")
+#define __DISABLE_AVXIFMA__
+#endif /* __AVXIFMA__ */
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di) __Z);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+ return (__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) __X,
+ (__v2di) __Y,
+ (__v2di) __Z);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di) __Z);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+ return (__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) __X,
+ (__v4di) __Y,
+ (__v4di) __Z);
+}
+
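+/* Usage sketch (illustrative comment only): the IFMA forms multiply
+   the low 52 bits of each 64-bit lane of __Y and __Z and add the low
+   (lo) or high (hi) 52 bits of the 104-bit product to __X, which suits
+   multi-word big-integer multiplication.
+
+     #include <immintrin.h>
+     __m256i mul52_accumulate (__m256i acc, __m256i a, __m256i b)
+     {
+       // acc[i] += low 52 bits of ((a[i] mod 2^52) * (b[i] mod 2^52)).
+       return _mm256_madd52lo_avx_epu64 (acc, a, b);
+     }
+*/
+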
+#ifdef __DISABLE_AVXIFMA__
+#undef __DISABLE_AVXIFMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXIFMA__ */
+
+#endif /* _AVXIFMAINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 11.0. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXINTRIN_H_INCLUDED
+#define _AVXINTRIN_H_INCLUDED
+
+#ifndef __AVX__
+#pragma GCC push_options
+#pragma GCC target("avx")
+#define __DISABLE_AVX__
+#endif /* __AVX__ */
+
+/* Internal data types for implementing the intrinsics. */
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+typedef signed char __v32qs __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef float __m256 __attribute__ ((__vector_size__ (32),
+ __may_alias__));
+typedef long long __m256i __attribute__ ((__vector_size__ (32),
+ __may_alias__));
+typedef double __m256d __attribute__ ((__vector_size__ (32),
+ __may_alias__));
+
+/* Unaligned version of the same types. */
+typedef float __m256_u __attribute__ ((__vector_size__ (32),
+ __may_alias__,
+ __aligned__ (1)));
+typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
+ __may_alias__,
+ __aligned__ (1)));
+typedef double __m256d_u __attribute__ ((__vector_size__ (32),
+ __may_alias__,
+ __aligned__ (1)));
+
+/* Compare predicates for scalar and packed compare intrinsics. */
+
+/* Equal (ordered, non-signaling) */
+#define _CMP_EQ_OQ 0x00
+/* Less-than (ordered, signaling) */
+#define _CMP_LT_OS 0x01
+/* Less-than-or-equal (ordered, signaling) */
+#define _CMP_LE_OS 0x02
+/* Unordered (non-signaling) */
+#define _CMP_UNORD_Q 0x03
+/* Not-equal (unordered, non-signaling) */
+#define _CMP_NEQ_UQ 0x04
+/* Not-less-than (unordered, signaling) */
+#define _CMP_NLT_US 0x05
+/* Not-less-than-or-equal (unordered, signaling) */
+#define _CMP_NLE_US 0x06
+/* Ordered (non-signaling) */
+#define _CMP_ORD_Q 0x07
+/* Equal (unordered, non-signaling) */
+#define _CMP_EQ_UQ 0x08
+/* Not-greater-than-or-equal (unordered, signaling) */
+#define _CMP_NGE_US 0x09
+/* Not-greater-than (unordered, signaling) */
+#define _CMP_NGT_US 0x0a
+/* False (ordered, non-signaling) */
+#define _CMP_FALSE_OQ 0x0b
+/* Not-equal (ordered, non-signaling) */
+#define _CMP_NEQ_OQ 0x0c
+/* Greater-than-or-equal (ordered, signaling) */
+#define _CMP_GE_OS 0x0d
+/* Greater-than (ordered, signaling) */
+#define _CMP_GT_OS 0x0e
+/* True (unordered, non-signaling) */
+#define _CMP_TRUE_UQ 0x0f
+/* Equal (ordered, signaling) */
+#define _CMP_EQ_OS 0x10
+/* Less-than (ordered, non-signaling) */
+#define _CMP_LT_OQ 0x11
+/* Less-than-or-equal (ordered, non-signaling) */
+#define _CMP_LE_OQ 0x12
+/* Unordered (signaling) */
+#define _CMP_UNORD_S 0x13
+/* Not-equal (unordered, signaling) */
+#define _CMP_NEQ_US 0x14
+/* Not-less-than (unordered, non-signaling) */
+#define _CMP_NLT_UQ 0x15
+/* Not-less-than-or-equal (unordered, non-signaling) */
+#define _CMP_NLE_UQ 0x16
+/* Ordered (signaling) */
+#define _CMP_ORD_S 0x17
+/* Equal (unordered, signaling) */
+#define _CMP_EQ_US 0x18
+/* Not-greater-than-or-equal (unordered, non-signaling) */
+#define _CMP_NGE_UQ 0x19
+/* Not-greater-than (unordered, non-signaling) */
+#define _CMP_NGT_UQ 0x1a
+/* False (ordered, signaling) */
+#define _CMP_FALSE_OS 0x1b
+/* Not-equal (ordered, signaling) */
+#define _CMP_NEQ_OS 0x1c
+/* Greater-than-or-equal (ordered, non-signaling) */
+#define _CMP_GE_OQ 0x1d
+/* Greater-than (ordered, non-signaling) */
+#define _CMP_GT_OQ 0x1e
+/* True (unordered, signaling) */
+#define _CMP_TRUE_US 0x1f
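+
+/* Usage sketch (illustrative, an editorial addition): the predicate
+   suffix encodes NaN behaviour -- O/U picks the ordered/unordered result
+   when either operand is NaN, S/Q picks whether quiet NaN operands raise
+   an invalid-operation exception.  A per-element "a < b" test that is
+   false on NaN and quiet on QNaNs:
+
+     __m256d lt = _mm256_cmp_pd (a, b, _CMP_LT_OQ);
+
+   Each element of the result is all-ones where the compare holds and
+   all-zeros otherwise.  */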
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) ((__v4df)__A + (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) ((__v8sf)__A + (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_addsub_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_addsub_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+/* Double/single precision floating point blend instructions - select
+ data from two sources using a constant or variable mask; see the usage
+ sketch after the definitions below. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
+{
+ return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
+ (__v4df)__Y,
+ __M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
+{
+ return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
+ (__v8sf)__Y,
+ __M);
+}
+#else
+#define _mm256_blend_pd(X, Y, M) \
+ ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), (int)(M)))
+
+#define _mm256_blend_ps(X, Y, M) \
+ ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(M)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
+{
+ return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
+ (__v4df)__Y,
+ (__v4df)__M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
+{
+ return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
+ (__v8sf)__Y,
+ (__v8sf)__M);
+}
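+
+/* Usage sketch (illustrative, an editorial addition): the immediate form
+   selects per element by mask bit -- bit N set picks Y for element N,
+   clear picks X -- while the variable form selects by the sign bit of
+   each mask element:
+
+     __m256d r1 = _mm256_blend_pd (x, y, 0x5);   selects y0, x1, y2, x3
+     __m256d r2 = _mm256_blendv_pd (x, y, m);    selects y where m < 0
+   */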
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) ((__v4df)__A / (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) ((__v8sf)__A / (__v8sf)__B);
+}
+
+/* Dot product instructions whose mask selects which products are summed
+ and which parts of the result are zeroed; see the usage sketch below. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
+{
+ return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
+ (__v8sf)__Y,
+ __M);
+}
+#else
+#define _mm256_dp_ps(X, Y, M) \
+ ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(M)))
+#endif
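+
+/* Usage sketch (illustrative, an editorial addition): within each 128-bit
+   lane, the high four mask bits choose which element products enter the
+   sum and the low four bits choose which result slots receive it (the
+   rest are zeroed).  A full four-element dot product per lane, written
+   to slot 0 of each lane:
+
+     __m256 dp = _mm256_dp_ps (x, y, 0xF1);
+   */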
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_pd (__m256d __X, __m256d __Y)
+{
+ return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_ps (__m256 __X, __m256 __Y)
+{
+ return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_pd (__m256d __X, __m256d __Y)
+{
+ return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_ps (__m256 __X, __m256 __Y)
+{
+ return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) ((__v4df)__A * (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) ((__v8sf)__A * (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
+{
+ return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
+ __mask);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
+{
+ return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
+ __mask);
+}
+#else
+#define _mm256_shuffle_pd(A, B, N) \
+ ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \
+ (__v4df)(__m256d)(B), (int)(N)))
+
+#define _mm256_shuffle_ps(A, B, N) \
+ ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \
+ (__v8sf)(__m256)(B), (int)(N)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) ((__v4df)__A - (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) ((__v8sf)__A - (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
+{
+ return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
+{
+ return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
+{
+ return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
+ __P);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
+{
+ return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
+ __P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
+{
+ return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
+{
+ return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
+}
+#else
+#define _mm_cmp_pd(X, Y, P) \
+ ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P)))
+
+#define _mm_cmp_ps(X, Y, P) \
+ ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P)))
+
+#define _mm256_cmp_pd(X, Y, P) \
+ ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), (int)(P)))
+
+#define _mm256_cmp_ps(X, Y, P) \
+ ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), (int)(P)))
+
+#define _mm_cmp_sd(X, Y, P) \
+ ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(P)))
+
+#define _mm_cmp_ss(X, Y, P) \
+ ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(P)))
+#endif
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsi256_si32 (__m256i __A)
+{
+ __v8si __B = (__v8si) __A;
+ return __B[0];
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_pd (__m128i __A)
+{
+ return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_ps (__m256i __A)
+{
+ return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_ps (__m256d __A)
+{
+ return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epi32 (__m256 __A)
+{
+ return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_pd (__m128 __A)
+{
+ return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epi32 (__m256d __A)
+{
+ return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epi32 (__m256d __A)
+{
+ return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epi32 (__m256 __A)
+{
+ return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsd_f64 (__m256d __A)
+{
+ return __A[0];
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtss_f32 (__m256 __A)
+{
+ return __A[0];
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_pd (__m256d __X, const int __N)
+{
+ return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_ps (__m256 __X, const int __N)
+{
+ return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_si256 (__m256i __X, const int __N)
+{
+ return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi32 (__m256i __X, int const __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
+ return _mm_extract_epi32 (__Y, __N % 4);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi16 (__m256i __X, int const __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
+ return _mm_extract_epi16 (__Y, __N % 8);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi8 (__m256i __X, int const __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
+ return _mm_extract_epi8 (__Y, __N % 16);
+}
+
+#ifdef __x86_64__
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi64 (__m256i __X, const int __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
+ return _mm_extract_epi64 (__Y, __N % 2);
+}
+#endif
+#else
+#define _mm256_extractf128_pd(X, N) \
+ ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \
+ (int)(N)))
+
+#define _mm256_extractf128_ps(X, N) \
+ ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \
+ (int)(N)))
+
+#define _mm256_extractf128_si256(X, N) \
+ ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \
+ (int)(N)))
+
+#define _mm256_extract_epi32(X, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
+ _mm_extract_epi32 (__Y, (N) % 4); \
+ }))
+
+#define _mm256_extract_epi16(X, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
+ _mm_extract_epi16 (__Y, (N) % 8); \
+ }))
+
+#define _mm256_extract_epi8(X, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
+ _mm_extract_epi8 (__Y, (N) % 16); \
+ }))
+
+#ifdef __x86_64__
+#define _mm256_extract_epi64(X, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
+ _mm_extract_epi64 (__Y, (N) % 2); \
+ }))
+#endif
+#endif
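+
+/* Usage sketch (illustrative, an editorial addition): the element
+   extracts above first select a 128-bit lane, then reuse the SSE4.1
+   element extract within it.  For element 5 of eight 32-bit ints,
+   5 >> 2 == 1 picks the upper lane and 5 % 4 == 1 its second element:
+
+     int e5 = _mm256_extract_epi32 (v, 5);
+   */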
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zeroall (void)
+{
+ __builtin_ia32_vzeroall ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zeroupper (void)
+{
+ __builtin_ia32_vzeroupper ();
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutevar_pd (__m128d __A, __m128i __C)
+{
+ return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
+ (__v2di)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar_pd (__m256d __A, __m256i __C)
+{
+ return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
+ (__v4di)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutevar_ps (__m128 __A, __m128i __C)
+{
+ return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
+ (__v4si)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar_ps (__m256 __A, __m256i __C)
+{
+ return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
+ (__v8si)__C);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute_pd (__m128d __X, const int __C)
+{
+ return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute_pd (__m256d __X, const int __C)
+{
+ return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute_ps (__m128 __X, const int __C)
+{
+ return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute_ps (__m256 __X, const int __C)
+{
+ return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
+}
+#else
+#define _mm_permute_pd(X, C) \
+ ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
+
+#define _mm256_permute_pd(X, C) \
+ ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C)))
+
+#define _mm_permute_ps(X, C) \
+ ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
+
+#define _mm256_permute_ps(X, C) \
+ ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
+{
+ return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
+ (__v4df)__Y,
+ __C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
+{
+ return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
+ (__v8sf)__Y,
+ __C);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
+{
+ return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
+ (__v8si)__Y,
+ __C);
+}
+#else
+#define _mm256_permute2f128_pd(X, Y, C) \
+ ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (int)(C)))
+
+#define _mm256_permute2f128_ps(X, Y, C) \
+ ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (int)(C)))
+
+#define _mm256_permute2f128_si256(X, Y, C) \
+ ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \
+ (__v8si)(__m256i)(Y), \
+ (int)(C)))
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcast_ss (float const *__X)
+{
+ return (__m128) __builtin_ia32_vbroadcastss (__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_sd (double const *__X)
+{
+ return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_ss (float const *__X)
+{
+ return (__m256) __builtin_ia32_vbroadcastss256 (__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_pd (__m128d const *__X)
+{
+ return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_ps (__m128 const *__X)
+{
+ return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
+{
+ return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
+ (__v2df)__Y,
+ __O);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
+{
+ return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
+ (__v4sf)__Y,
+ __O);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
+{
+ return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
+ (__v4si)__Y,
+ __O);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
+ __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
+ return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
+ __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
+ return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
+ __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
+ return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
+}
+
+#ifdef __x86_64__
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
+{
+ __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
+ __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
+ return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
+}
+#endif
+#else
+#define _mm256_insertf128_pd(X, Y, O) \
+ ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (int)(O)))
+
+#define _mm256_insertf128_ps(X, Y, O) \
+ ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \
+ (__v4sf)(__m128)(Y), \
+ (int)(O)))
+
+#define _mm256_insertf128_si256(X, Y, O) \
+ ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \
+ (__v4si)(__m128i)(Y), \
+ (int)(O)))
+
+#define _mm256_insert_epi32(X, D, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \
+ __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \
+ _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \
+ }))
+
+#define _mm256_insert_epi16(X, D, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \
+ __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \
+ _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \
+ }))
+
+#define _mm256_insert_epi8(X, D, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \
+ __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \
+ _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \
+ }))
+
+#ifdef __x86_64__
+#define _mm256_insert_epi64(X, D, N) \
+ (__extension__ \
+ ({ \
+ __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \
+ __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \
+ _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \
+ }))
+#endif
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_pd (double const *__P)
+{
+ return *(__m256d *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_pd (double *__P, __m256d __A)
+{
+ *(__m256d *)__P = __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_ps (float const *__P)
+{
+ return *(__m256 *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_ps (float *__P, __m256 __A)
+{
+ *(__m256 *)__P = __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_pd (double const *__P)
+{
+ return *(__m256d_u *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_pd (double *__P, __m256d __A)
+{
+ *(__m256d_u *)__P = __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_ps (float const *__P)
+{
+ return *(__m256_u *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_ps (float *__P, __m256 __A)
+{
+ *(__m256_u *)__P = __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_si256 (__m256i const *__P)
+{
+ return *__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_si256 (__m256i *__P, __m256i __A)
+{
+ *__P = __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_si256 (__m256i_u const *__P)
+{
+ return *__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
+{
+ *__P = __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_pd (double const *__P, __m128i __M)
+{
+ return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
+ (__v2di)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
+{
+ __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_pd (double const *__P, __m256i __M)
+{
+ return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
+ (__v4di)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
+{
+ __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_ps (float const *__P, __m128i __M)
+{
+ return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
+ (__v4si)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
+{
+ __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_ps (float const *__P, __m256i __M)
+{
+ return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
+ (__v8si)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
+{
+ __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movehdup_ps (__m256 __X)
+{
+ return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_moveldup_ps (__m256 __X)
+{
+ return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movedup_pd (__m256d __X)
+{
+ return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lddqu_si256 (__m256i const *__P)
+{
+ return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_si256 (__m256i *__A, __m256i __B)
+{
+ __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_pd (double *__A, __m256d __B)
+{
+ __builtin_ia32_movntpd256 (__A, (__v4df)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_ps (float *__P, __m256 __A)
+{
+ __builtin_ia32_movntps256 (__P, (__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_pd (__m256d __A)
+{
+ return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_round_pd (__m256d __V, const int __M)
+{
+ return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_round_ps (__m256 __V, const int __M)
+{
+ return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
+}
+#else
+#define _mm256_round_pd(V, M) \
+ ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))
+
+#define _mm256_round_ps(V, M) \
+ ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
+#endif
+
+#define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR)
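+
+/* Usage sketch (illustrative, an editorial addition): the _MM_FROUND_*
+   constants are the SSE4.1 rounding controls from <smmintrin.h>; e.g.
+   round to nearest even without raising a precision exception:
+
+     __m256d n = _mm256_round_pd (v, _MM_FROUND_TO_NEAREST_INT
+                                     | _MM_FROUND_NO_EXC);
+   */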
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_pd (__m256d __A, __m256d __B)
+{
+ return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_ps (__m256 __A, __m256 __B)
+{
+ return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_pd (__m128d __M, __m128d __V)
+{
+ return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_pd (__m128d __M, __m128d __V)
+{
+ return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_pd (__m128d __M, __m128d __V)
+{
+ return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_ps (__m128 __M, __m128 __V)
+{
+ return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_ps (__m128 __M, __m128 __V)
+{
+ return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_ps (__m128 __M, __m128 __V)
+{
+ return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_pd (__m256d __M, __m256d __V)
+{
+ return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_pd (__m256d __M, __m256d __V)
+{
+ return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_pd (__m256d __M, __m256d __V)
+{
+ return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_ps (__m256 __M, __m256 __V)
+{
+ return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_ps (__m256 __M, __m256 __V)
+{
+ return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_ps (__m256 __M, __m256 __V)
+{
+ return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_si256 (__m256i __M, __m256i __V)
+{
+ return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_si256 (__m256i __M, __m256i __V)
+{
+ return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_si256 (__m256i __M, __m256i __V)
+{
+ return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_pd (__m256d __A)
+{
+ return __builtin_ia32_movmskpd256 ((__v4df)__A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_ps (__m256 __A)
+{
+ return __builtin_ia32_movmskps256 ((__v8sf)__A);
+}
+
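+/* Return a vector with undefined contents.  The self-initialisation in
+   the bodies below is a deliberate idiom: it tells the compiler the
+   value is a don't-care, so no instruction need be emitted, and the
+   pragmas silence the -Winit-self warning it would otherwise trigger.  */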
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_pd (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m256d __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_ps (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m256 __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_si256 (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m256i __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_pd (void)
+{
+ return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_ps (void)
+{
+ return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
+ 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_si256 (void)
+{
+ return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
+}
+
+/* Create the vector [A B C D]. */
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_pd (double __A, double __B, double __C, double __D)
+{
+ return __extension__ (__m256d){ __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H]. */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_ps (float __A, float __B, float __C, float __D,
+ float __E, float __F, float __G, float __H)
+{
+ return __extension__ (__m256){ __H, __G, __F, __E,
+ __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H]. */
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi32 (int __A, int __B, int __C, int __D,
+ int __E, int __F, int __G, int __H)
+{
+ return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
+ __D, __C, __B, __A };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
+ short __q11, short __q10, short __q09, short __q08,
+ short __q07, short __q06, short __q05, short __q04,
+ short __q03, short __q02, short __q01, short __q00)
+{
+ return __extension__ (__m256i)(__v16hi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+ };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28,
+ char __q27, char __q26, char __q25, char __q24,
+ char __q23, char __q22, char __q21, char __q20,
+ char __q19, char __q18, char __q17, char __q16,
+ char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00)
+{
+ return __extension__ (__m256i)(__v32qi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+ __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+ __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
+ };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi64x (long long __A, long long __B, long long __C,
+ long long __D)
+{
+ return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
+}
+
+/* Create a vector with all elements equal to A. */
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pd (double __A)
+{
+ return __extension__ (__m256d){ __A, __A, __A, __A };
+}
+
+/* Create a vector with all elements equal to A. */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_ps (float __A)
+{
+ return __extension__ (__m256){ __A, __A, __A, __A,
+ __A, __A, __A, __A };
+}
+
+/* Create a vector with all elements equal to A. */
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi32 (int __A)
+{
+ return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
+ __A, __A, __A, __A };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi16 (short __A)
+{
+ return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi8 (char __A)
+{
+ return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi64x (long long __A)
+{
+ return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
+}
+
+/* Create vectors of elements in the reversed order from the
+ _mm256_set_XXX functions. */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_pd (double __A, double __B, double __C, double __D)
+{
+ return _mm256_set_pd (__D, __C, __B, __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_ps (float __A, float __B, float __C, float __D,
+ float __E, float __F, float __G, float __H)
+{
+ return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
+ int __E, int __F, int __G, int __H)
+{
+ return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
+ short __q11, short __q10, short __q09, short __q08,
+ short __q07, short __q06, short __q05, short __q04,
+ short __q03, short __q02, short __q01, short __q00)
+{
+ return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
+ __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11,
+ __q12, __q13, __q14, __q15);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28,
+ char __q27, char __q26, char __q25, char __q24,
+ char __q23, char __q22, char __q21, char __q20,
+ char __q19, char __q18, char __q17, char __q16,
+ char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00)
+{
+ return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
+ __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11,
+ __q12, __q13, __q14, __q15,
+ __q16, __q17, __q18, __q19,
+ __q20, __q21, __q22, __q23,
+ __q24, __q25, __q26, __q27,
+ __q28, __q29, __q30, __q31);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi64x (long long __A, long long __B, long long __C,
+ long long __D)
+{
+ return _mm256_set_epi64x (__D, __C, __B, __A);
+}
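+
+/* Usage sketch (illustrative, an editorial addition): _mm256_set_* takes
+   arguments from the highest element down, _mm256_setr_* from the lowest
+   element up, so both calls below build the vector {0.0, 1.0, 2.0, 3.0}:
+
+     __m256d a = _mm256_set_pd  (3.0, 2.0, 1.0, 0.0);
+     __m256d b = _mm256_setr_pd (0.0, 1.0, 2.0, 3.0);
+   */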
+
+/* Casts between various SP, DP, INT vector types. Note that these do no
+ conversion of values; they just change the type. */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_ps (__m256d __A)
+{
+ return (__m256) __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_si256 (__m256d __A)
+{
+ return (__m256i) __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_pd (__m256 __A)
+{
+ return (__m256d) __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_si256(__m256 __A)
+{
+ return (__m256i) __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_ps (__m256i __A)
+{
+ return (__m256) __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_pd (__m256i __A)
+{
+ return (__m256d) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd256_pd128 (__m256d __A)
+{
+ return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps256_ps128 (__m256 __A)
+{
+ return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_si128 (__m256i __A)
+{
+ return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
+}
+
+/* When a cast is done from a 128-bit to a 256-bit type, the low 128
+ bits of the 256-bit result contain the source parameter value and the
+ upper 128 bits of the result are undefined. These intrinsics shouldn't
+ generate any extra moves. */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd128_pd256 (__m128d __A)
+{
+ return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps128_ps256 (__m128 __A)
+{
+ return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi128_si256 (__m128i __A)
+{
+ return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
+}
+
+/* Similarly, but with zero extension instead of undefined values. */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextpd128_pd256 (__m128d __A)
+{
+ return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextps128_ps256 (__m128 __A)
+{
+ return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextsi128_si256 (__m128i __A)
+{
+ return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
+}
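+
+/* Usage sketch (illustrative, an editorial addition): after a widening
+   cast the upper 128 bits of the result must not be relied upon, whereas
+   the zero-extending forms give them a defined value:
+
+     __m256d c = _mm256_castpd128_pd256 (x);   upper lane undefined
+     __m256d z = _mm256_zextpd128_pd256 (x);   upper lane zeroed
+   */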
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_m128 ( __m128 __H, __m128 __L)
+{
+ return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_m128d (__m128d __H, __m128d __L)
+{
+ return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_m128i (__m128i __H, __m128i __L)
+{
+ return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_m128 (__m128 __L, __m128 __H)
+{
+ return _mm256_set_m128 (__H, __L);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_m128d (__m128d __L, __m128d __H)
+{
+ return _mm256_set_m128d (__H, __L);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_m128i (__m128i __L, __m128i __H)
+{
+ return _mm256_set_m128i (__H, __L);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128 (float const *__PH, float const *__PL)
+{
+ return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
+ _mm_loadu_ps (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
+{
+ _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
+ _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128d (double const *__PH, double const *__PL)
+{
+ return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
+ _mm_loadu_pd (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
+{
+ _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
+ _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
+{
+ return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
+ _mm_loadu_si128 (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
+{
+ _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
+ _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
+}
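+
+/* Usage sketch (illustrative, an editorial addition): the loadu2/storeu2
+   pairs move the two 128-bit halves of a 256-bit vector to and from
+   independent, possibly unaligned addresses:
+
+     __m256 v = _mm256_loadu2_m128 (hi_ptr, lo_ptr);
+     _mm256_storeu2_m128 (hi_ptr, lo_ptr, v);
+   */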
+
+#ifdef __DISABLE_AVX__
+#undef __DISABLE_AVX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX__ */
+
+#endif /* _AVXINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2021-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXNECONVERTINTRIN_H_INCLUDED
+#define _AVXNECONVERTINTRIN_H_INCLUDED
+
+#ifndef __AVXNECONVERT__
+#pragma GCC push_options
+#pragma GCC target ("avxneconvert")
+#define __DISABLE_AVXNECONVERT__
+#endif /* __AVXNECONVERT__ */
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bcstnebf16_ps (const void *__P)
+{
+ return (__m128) __builtin_ia32_vbcstnebf162ps128 ((const __bf16 *) __P);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bcstnebf16_ps (const void *__P)
+{
+ return (__m256) __builtin_ia32_vbcstnebf162ps256 ((const __bf16 *) __P);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bcstnesh_ps (const void *__P)
+{
+ return (__m128) __builtin_ia32_vbcstnesh2ps128 ((const _Float16 *) __P);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bcstnesh_ps (const void *__P)
+{
+ return (__m256) __builtin_ia32_vbcstnesh2ps256 ((const _Float16 *) __P);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneebf16_ps (const __m128bh *__A)
+{
+ return (__m128) __builtin_ia32_vcvtneebf162ps128 ((const __v8bf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneebf16_ps (const __m256bh *__A)
+{
+ return (__m256) __builtin_ia32_vcvtneebf162ps256 ((const __v16bf *) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneeph_ps (const __m128h *__A)
+{
+ return (__m128) __builtin_ia32_vcvtneeph2ps128 ((const __v8hf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneeph_ps (const __m256h *__A)
+{
+ return (__m256) __builtin_ia32_vcvtneeph2ps256 ((const __v16hf *) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneobf16_ps (const __m128bh *__A)
+{
+ return (__m128) __builtin_ia32_vcvtneobf162ps128 ((const __v8bf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneobf16_ps (const __m256bh *__A)
+{
+ return (__m256) __builtin_ia32_vcvtneobf162ps256 ((const __v16bf *) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneoph_ps (const __m128h *__A)
+{
+ return (__m128) __builtin_ia32_vcvtneoph2ps128 ((const __v8hf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneoph_ps (const __m256h *__A)
+{
+ return (__m256) __builtin_ia32_vcvtneoph2ps256 ((const __v16hf *) __A);
+}
+
+extern __inline __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneps_avx_pbh (__m128 __A)
+{
+ return (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (__A);
+}
+
+extern __inline __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneps_avx_pbh (__m256 __A)
+{
+ return (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (__A);
+}
+
+#ifdef __DISABLE_AVXNECONVERT__
+#undef __DISABLE_AVXNECONVERT__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXNECONVERT__ */
+
+#endif /* _AVXNECONVERTINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXVNNIINT8INTRIN_H_INCLUDED
+#define _AVXVNNIINT8INTRIN_H_INCLUDED
+
+#if !defined(__AVXVNNIINT8__)
+#pragma GCC push_options
+#pragma GCC target("avxvnniint8")
+#define __DISABLE_AVXVNNIINT8__
+#endif /* __AVXVNNIINT8__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbssd_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+ return (__m128i)
+ __builtin_ia32_vpdpbssd128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbssds_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+ return (__m128i)
+ __builtin_ia32_vpdpbssds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbsud_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+ return (__m128i)
+ __builtin_ia32_vpdpbsud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbsuds_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+ return (__m128i)
+ __builtin_ia32_vpdpbsuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbuud_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+ return (__m128i)
+ __builtin_ia32_vpdpbuud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbuuds_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+ return (__m128i)
+ __builtin_ia32_vpdpbuuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbssd_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+ return (__m256i)
+ __builtin_ia32_vpdpbssd256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbssds_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+ return (__m256i)
+ __builtin_ia32_vpdpbssds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbsud_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+ return (__m256i)
+ __builtin_ia32_vpdpbsud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbsuds_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+ return (__m256i)
+ __builtin_ia32_vpdpbsuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbuud_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+ return (__m256i)
+ __builtin_ia32_vpdpbuud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbuuds_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+ return (__m256i)
+ __builtin_ia32_vpdpbuuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+#ifdef __DISABLE_AVXVNNIINT8__
+#undef __DISABLE_AVXVNNIINT8__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXVNNIINT8__ */
+
+#endif /* _AVXVNNIINT8INTRIN_H_INCLUDED */
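
A minimal usage sketch for the byte dot-product intrinsics above (assumptions: a compiler invoked with -mavxvnniint8 and a CPU advertising AVX-VNNI-INT8). Each 32-bit lane accumulates four adjacent signed-byte products:

#include <immintrin.h>
#include <stdio.h>

int
main (void)
{
  __m128i acc = _mm_setzero_si128 ();
  __m128i a = _mm_set1_epi8 (2);    /* sixteen signed bytes, all 2 */
  __m128i b = _mm_set1_epi8 (-3);   /* sixteen signed bytes, all -3 */

  /* Each dword lane becomes 0 + 4 * (2 * -3) = -24.  */
  acc = _mm_dpbssd_epi32 (acc, a, b);

  int out[4];
  _mm_storeu_si128 ((__m128i *) out, acc);
  printf ("%d\n", out[0]);          /* prints -24 */
  return 0;
}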
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXVNNIINTRIN_H_INCLUDED
+#define _AVXVNNIINTRIN_H_INCLUDED
+
+#if !defined(__AVXVNNI__)
+#pragma GCC push_options
+#pragma GCC target("avxvnni")
+#define __DISABLE_AVXVNNIVL__
+#endif /* __AVXVNNI__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbusd_avx_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbusd_avx_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbusds_avx_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbusds_avx_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpwssd_avx_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpwssd_avx_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpwssds_avx_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si) __A,
+ (__v8si) __B,
+ (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpwssds_avx_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) __A,
+ (__v4si) __B,
+ (__v4si) __C);
+}
+
+#ifdef __DISABLE_AVXVNNIVL__
+#undef __DISABLE_AVXVNNIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXVNNIVL__ */
+#endif /* _AVXVNNIINTRIN_H_INCLUDED */
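
A sketch of the unsigned-by-signed form (assuming -mavxvnni): __A carries the dword accumulators, __B the unsigned bytes, __C the signed bytes. The helper below, dot_u8_s8, is purely illustrative:

#include <immintrin.h>

static int
dot_u8_s8 (const unsigned char *u, const signed char *s, int n)
{
  /* n is assumed to be a multiple of 16 and small enough that the
     32-bit lane accumulators cannot overflow.  */
  __m128i acc = _mm_setzero_si128 ();
  for (int i = 0; i < n; i += 16)
    acc = _mm_dpbusd_avx_epi32 (acc,
                                _mm_loadu_si128 ((const __m128i_u *) (u + i)),
                                _mm_loadu_si128 ((const __m128i_u *) (s + i)));
  int lane[4];
  _mm_storeu_si128 ((__m128i *) lane, acc);
  return lane[0] + lane[1] + lane[2] + lane[3];
}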
--- /dev/null
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+#ifndef __BMI2__
+#pragma GCC push_options
+#pragma GCC target("bmi2")
+#define __DISABLE_BMI2__
+#endif /* __BMI2__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_pext_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+ unsigned long long *__P)
+{
+ unsigned __int128 __res = (unsigned __int128) __X * __Y;
+ *__P = (unsigned long long) (__res >> 64);
+ return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+ unsigned long long __res = (unsigned long long) __X * __Y;
+ *__P = (unsigned int) (__res >> 32);
+ return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__ */
+
+#ifdef __DISABLE_BMI2__
+#undef __DISABLE_BMI2__
+#pragma GCC pop_options
+#endif /* __DISABLE_BMI2__ */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
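
A sketch of what the BMI2 quartet computes (assuming -mbmi2): _pext_u32 gathers the mask-selected bits down to the low end, _pdep_u32 scatters them back out, _bzhi_u32 clears everything from a bit index upward, and _mulx_u64 is a flags-free widening multiply:

#include <x86gprintrin.h>
#include <assert.h>

int
main (void)
{
  assert (_pext_u32 (0xb2, 0xf0) == 0xb);   /* gather bits 4..7 of 1011 0010 */
  assert (_pdep_u32 (0xb, 0xf0) == 0xb0);   /* scatter them back into bits 4..7 */
  assert (_bzhi_u32 (0xff, 4) == 0x0f);     /* zero bits 4 and above */
#ifdef __x86_64__
  unsigned long long hi;
  unsigned long long lo = _mulx_u64 (1ULL << 40, 1ULL << 40, &hi);
  assert (lo == 0 && hi == (1ULL << 16));   /* 2^40 * 2^40 = 2^80 */
#endif
  return 0;
}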
--- /dev/null
+/* Copyright (C) 2010-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _BMIINTRIN_H_INCLUDED
+#define _BMIINTRIN_H_INCLUDED
+
+#ifndef __BMI__
+#pragma GCC push_options
+#pragma GCC target("bmi")
+#define __DISABLE_BMI__
+#endif /* __BMI__ */
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u16 (unsigned short __X)
+{
+ return __builtin_ia32_tzcnt_u16 (__X);
+}
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u16 (unsigned short __X)
+{
+ return __builtin_ia32_tzcnt_u16 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u32 (unsigned int __X, unsigned int __Y)
+{
+ return ~__X & __Y;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_andn_u32 (unsigned int __X, unsigned int __Y)
+{
+ return __andn_u32 (__X, __Y);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u32 (unsigned int __X, unsigned int __Y)
+{
+ return __builtin_ia32_bextr_u32 (__X, __Y);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u32 (unsigned int __X, unsigned int __Y, unsigned int __Z)
+{
+ return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u32 (unsigned int __X)
+{
+ return __X & -__X;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u32 (unsigned int __X)
+{
+ return __blsi_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u32 (unsigned int __X)
+{
+ return __X ^ (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u32 (unsigned int __X)
+{
+ return __blsmsk_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u32 (unsigned int __X)
+{
+ return __X & (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u32 (unsigned int __X)
+{
+ return __blsr_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u32 (unsigned int __X)
+{
+ return __builtin_ia32_tzcnt_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u32 (unsigned int __X)
+{
+ return __builtin_ia32_tzcnt_u32 (__X);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return ~__X & __Y;
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return __andn_u64 (__X, __Y);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return __builtin_ia32_bextr_u64 (__X, __Y);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u64 (unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+ return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u64 (unsigned long long __X)
+{
+ return __X & -__X;
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u64 (unsigned long long __X)
+{
+ return __blsi_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u64 (unsigned long long __X)
+{
+ return __X ^ (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u64 (unsigned long long __X)
+{
+ return __blsmsk_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u64 (unsigned long long __X)
+{
+ return __X & (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u64 (unsigned long long __X)
+{
+ return __blsr_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u64 (unsigned long long __X)
+{
+ return __builtin_ia32_tzcnt_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u64 (unsigned long long __X)
+{
+ return __builtin_ia32_tzcnt_u64 (__X);
+}
+
+#endif /* __x86_64__ */
+
+#ifdef __DISABLE_BMI__
+#undef __DISABLE_BMI__
+#pragma GCC pop_options
+#endif /* __DISABLE_BMI__ */
+
+#endif /* _BMIINTRIN_H_INCLUDED */
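
And a sketch for the BMI1 helpers above (assuming -mbmi); the double- and single-underscore names are interchangeable except for _bextr_u32, whose three-argument form takes the start bit and field length separately:

#include <x86gprintrin.h>
#include <assert.h>

int
main (void)
{
  unsigned int x = 0x18;                       /* 0001 1000 */
  assert (__tzcnt_u32 (x) == 3);               /* trailing zero count */
  assert (__blsi_u32 (x) == 0x08);             /* isolate lowest set bit */
  assert (__blsr_u32 (x) == 0x10);             /* clear lowest set bit */
  assert (__blsmsk_u32 (x) == 0x0f);           /* mask through lowest set bit */
  assert (_bextr_u32 (0xabcd, 4, 8) == 0xbc);  /* 8 bits starting at bit 4 */
  return 0;
}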
--- /dev/null
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <cetintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CETINTRIN_H_INCLUDED
+#define _CETINTRIN_H_INCLUDED
+
+#ifndef __SHSTK__
+#pragma GCC push_options
+#pragma GCC target ("shstk")
+#define __DISABLE_SHSTK__
+#endif /* __SHSTK__ */
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_get_ssp (void)
+{
+ return __builtin_ia32_rdsspq ();
+}
+#else
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_get_ssp (void)
+{
+ return __builtin_ia32_rdsspd ();
+}
+#endif
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_inc_ssp (unsigned int __B)
+{
+#ifdef __x86_64__
+ __builtin_ia32_incsspq ((unsigned long long) __B);
+#else
+ __builtin_ia32_incsspd (__B);
+#endif
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_saveprevssp (void)
+{
+ __builtin_ia32_saveprevssp ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rstorssp (void *__B)
+{
+ __builtin_ia32_rstorssp (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrssd (unsigned int __B, void *__C)
+{
+ __builtin_ia32_wrssd (__B, __C);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrssq (unsigned long long __B, void *__C)
+{
+ __builtin_ia32_wrssq (__B, __C);
+}
+#endif
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrussd (unsigned int __B, void *__C)
+{
+ __builtin_ia32_wrussd (__B, __C);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrussq (unsigned long long __B, void *__C)
+{
+ __builtin_ia32_wrussq (__B, __C);
+}
+#endif
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_setssbsy (void)
+{
+ __builtin_ia32_setssbsy ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_clrssbsy (void *__B)
+{
+ __builtin_ia32_clrssbsy (__B);
+}
+
+#ifdef __DISABLE_SHSTK__
+#undef __DISABLE_SHSTK__
+#pragma GCC pop_options
+#endif /* __DISABLE_SHSTK__ */
+
+#endif /* _CETINTRIN_H_INCLUDED */
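
A cautious sketch for the shadow-stack intrinsics (x86-64, compiled with -mshstk): most of them fault unless the kernel has enabled CET shadow stacks, but _get_ssp is safe either way, since RDSSP executes as a NOP with shadow stacks off and the generated code is expected to zero the destination first, so a return of 0 means "not enabled" (an assumption about GCC's code generation, not something this header guarantees):

#include <x86gprintrin.h>
#include <stdio.h>

int
main (void)
{
  unsigned long long ssp = _get_ssp ();
  if (ssp == 0)
    printf ("shadow stacks not enabled for this thread\n");
  else
    printf ("ssp = %#llx\n", ssp);
  return 0;
}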
--- /dev/null
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <cldemoteintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CLDEMOTE_H_INCLUDED
+#define _CLDEMOTE_H_INCLUDED
+
+#ifndef __CLDEMOTE__
+#pragma GCC push_options
+#pragma GCC target("cldemote")
+#define __DISABLE_CLDEMOTE__
+#endif /* __CLDEMOTE__ */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cldemote (void *__A)
+{
+ __builtin_ia32_cldemote (__A);
+}
+#ifdef __DISABLE_CLDEMOTE__
+#undef __DISABLE_CLDEMOTE__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLDEMOTE__ */
+
+#endif /* _CLDEMOTE_H_INCLUDED */
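
CLDEMOTE is only a hint: it asks the core to move the addressed line toward a more distant, shared cache level, which helps producer-consumer handoffs, and it executes as a NOP on parts without the feature. A sketch, with the 64-byte line size an assumption:

#include <x86gprintrin.h>

/* Hypothetical producer: fill a buffer, then hint that another
   core will read the lines next.  */
static void
produce (char *buf, int len)
{
  for (int i = 0; i < len; i++)
    buf[i] = (char) i;
  for (int i = 0; i < len; i += 64)
    _cldemote (buf + i);
}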
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <clflushoptintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CLFLUSHOPTINTRIN_H_INCLUDED
+#define _CLFLUSHOPTINTRIN_H_INCLUDED
+
+#ifndef __CLFLUSHOPT__
+#pragma GCC push_options
+#pragma GCC target("clflushopt")
+#define __DISABLE_CLFLUSHOPT__
+#endif /* __CLFLUSHOPT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clflushopt (void *__A)
+{
+ __builtin_ia32_clflushopt (__A);
+}
+
+#ifdef __DISABLE_CLFLUSHOPT__
+#undef __DISABLE_CLFLUSHOPT__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLFLUSHOPT__ */
+
+#endif /* _CLFLUSHOPTINTRIN_H_INCLUDED */
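
CLFLUSHOPT is weakly ordered, unlike CLFLUSH, so code that must know the writebacks are ordered before later stores (persistent-memory flushing is the classic case) pairs the loop with SFENCE. A sketch assuming -mclflushopt and 64-byte lines:

#include <immintrin.h>

static void
flush_range (void *p, unsigned long len)
{
  char *c = (char *) p;
  for (unsigned long i = 0; i < len; i += 64)
    _mm_clflushopt (c + i);
  _mm_sfence ();   /* order the optimized flushes before later stores */
}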
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <clwbintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CLWBINTRIN_H_INCLUDED
+#define _CLWBINTRIN_H_INCLUDED
+
+#ifndef __CLWB__
+#pragma GCC push_options
+#pragma GCC target("clwb")
+#define __DISABLE_CLWB__
+#endif /* __CLWB__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clwb (void *__A)
+{
+ __builtin_ia32_clwb (__A);
+}
+
+#ifdef __DISABLE_CLWB__
+#undef __DISABLE_CLWB__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLWB__ */
+
+#endif /* _CLWBINTRIN_H_INCLUDED */
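
CLWB follows the same fencing discipline but may leave the line valid in the cache after writing it back, which is the better choice when the data will be re-read soon. A sketch:

#include <immintrin.h>

static void
persist_line (void *p)
{
  _mm_clwb (p);    /* write the line back, possibly keeping it cached */
  _mm_sfence ();   /* make the writeback ordered with later stores */
}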
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _CLZEROINTRIN_H_INCLUDED
+#define _CLZEROINTRIN_H_INCLUDED
+
+#ifndef __CLZERO__
+#pragma GCC push_options
+#pragma GCC target("clzero")
+#define __DISABLE_CLZERO__
+#endif /* __CLZERO__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clzero (void *__I)
+{
+ __builtin_ia32_clzero (__I);
+}
+
+#ifdef __DISABLE_CLZERO__
+#undef __DISABLE_CLZERO__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLZERO__ */
+
+#endif /* _CLZEROINTRIN_H_INCLUDED */
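
CLZERO (an AMD extension) zeroes the entire cache line containing the addressed byte, so callers normally hand it line-aligned pointers. A sketch assuming -mclzero, 64-byte lines, and inclusion via <x86intrin.h>:

#include <x86intrin.h>

static char buf[256] __attribute__ ((aligned (64)));

static void
zero_buf (void)
{
  for (int i = 0; i < 256; i += 64)
    _mm_clzero (buf + i);
}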
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+#error "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CMPCCXADDINTRIN_H_INCLUDED
+#define _CMPCCXADDINTRIN_H_INCLUDED
+
+#ifdef __x86_64__
+
+#ifndef __CMPCCXADD__
+#pragma GCC push_options
+#pragma GCC target("cmpccxadd")
+#define __DISABLE_CMPCCXADD__
+#endif /* __CMPCCXADD__ */
+
+typedef enum {
+ _CMPCCX_O, /* Overflow. */
+ _CMPCCX_NO, /* No overflow. */
+ _CMPCCX_B, /* Below. */
+ _CMPCCX_NB, /* Not below. */
+ _CMPCCX_Z, /* Zero. */
+ _CMPCCX_NZ, /* Not zero. */
+ _CMPCCX_BE, /* Below or equal. */
+ _CMPCCX_NBE, /* Neither below nor equal. */
+ _CMPCCX_S, /* Sign. */
+ _CMPCCX_NS, /* No sign. */
+ _CMPCCX_P, /* Parity. */
+ _CMPCCX_NP, /* No parity. */
+ _CMPCCX_L, /* Less. */
+ _CMPCCX_NL, /* Not less. */
+ _CMPCCX_LE, /* Less or equal. */
+ _CMPCCX_NLE, /* Neither less nor equal. */
+} _CMPCCX_ENUM;
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cmpccxadd_epi32 (int *__A, int __B, int __C, const _CMPCCX_ENUM __D)
+{
+ return __builtin_ia32_cmpccxadd (__A, __B, __C, __D);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cmpccxadd_epi64 (long long *__A, long long __B, long long __C,
+ const _CMPCCX_ENUM __D)
+{
+ return __builtin_ia32_cmpccxadd64 (__A, __B, __C, __D);
+}
+#else
+#define _cmpccxadd_epi32(A,B,C,D) \
+ __builtin_ia32_cmpccxadd ((int *) (A), (int) (B), (int) (C), \
+ (_CMPCCX_ENUM) (D))
+#define _cmpccxadd_epi64(A,B,C,D) \
+ __builtin_ia32_cmpccxadd64 ((long long *) (A), (long long) (B), \
+ (long long) (C), (_CMPCCX_ENUM) (D))
+#endif
+
+#ifdef __DISABLE_CMPCCXADD__
+#undef __DISABLE_CMPCCXADD__
+#pragma GCC pop_options
+#endif /* __DISABLE_CMPCCXADD__ */
+
+#endif
+
+#endif /* _CMPCCXADDINTRIN_H_INCLUDED */
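
CMPccXADD fuses "load old value, compare, conditionally add, return old value" into one atomic instruction. Reading the comparison as old-value cc second-operand (an assumption consistent with the condition names above), a bounded counter becomes a one-liner:

#include <x86gprintrin.h>

/* Hypothetical bounded increment: add 1 to *ctr only while it is
   still below limit; the previous value is always returned.  */
static int
bounded_inc (int *ctr, int limit)
{
  return _cmpccxadd_epi32 (ctr, limit, 1, _CMPCCX_L);
}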
--- /dev/null
+/* Copyright (C) 2003-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 9.0. */
+
+#ifndef _EMMINTRIN_H_INCLUDED
+#define _EMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE header files.  */
+#include <xmmintrin.h>
+
+#ifndef __SSE2__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+/* SSE2 */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
+typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef signed char __v16qs __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same types. */
+typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
+/* Create a selector for use with the SHUFPD instruction. */
+#define _MM_SHUFFLE2(fp1,fp0) \
+ (((fp1) << 1) | (fp0))
+
+/* Create a vector with element 0 as F and the rest zero. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sd (double __F)
+{
+ return __extension__ (__m128d){ __F, 0.0 };
+}
+
+/* Create a vector with both elements equal to F. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pd (double __F)
+{
+ return __extension__ (__m128d){ __F, __F };
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd1 (double __F)
+{
+ return _mm_set1_pd (__F);
+}
+
+/* Create a vector with the lower value X and upper value W. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd (double __W, double __X)
+{
+ return __extension__ (__m128d){ __X, __W };
+}
+
+/* Create a vector with the lower value W and upper value X. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pd (double __W, double __X)
+{
+ return __extension__ (__m128d){ __W, __X };
+}
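
The argument order is the classic trap here: _mm_set_pd lists elements high-first, _mm_setr_pd low-first. A quick sketch:

#include <emmintrin.h>
#include <assert.h>

int
main (void)
{
  __m128d a = _mm_set_pd (3.0, 1.0);   /* element 0 = 1.0, element 1 = 3.0 */
  __m128d b = _mm_setr_pd (1.0, 3.0);  /* the same vector, arguments reversed */
  assert (_mm_cvtsd_f64 (a) == 1.0 && _mm_cvtsd_f64 (b) == 1.0);
  return 0;
}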
+
+/* Create an undefined vector. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_pd (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
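+  /* Deliberate self-initialization: yields an indeterminate value
+     without any load; the pragmas silence -Winit-self.  */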
+ __m128d __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+/* Create a vector of zeros. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_pd (void)
+{
+ return __extension__ (__m128d){ 0.0, 0.0 };
+}
+
+/* Sets the low DPFP value of A from the low value of B. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sd (__m128d __A, __m128d __B)
+{
+ return __extension__ (__m128d) __builtin_shuffle ((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
+}
+
+/* Load two DPFP values from P. The address must be 16-byte aligned. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd (double const *__P)
+{
+ return *(__m128d *)__P;
+}
+
+/* Load two DPFP values from P. The address need not be 16-byte aligned. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_pd (double const *__P)
+{
+ return *(__m128d_u *)__P;
+}
+
+/* Create a vector with both elements equal to *P.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_pd (double const *__P)
+{
+ return _mm_set1_pd (*__P);
+}
+
+/* Create a vector with element 0 as *P and the rest zero. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sd (double const *__P)
+{
+ return _mm_set_sd (*__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd1 (double const *__P)
+{
+ return _mm_load1_pd (__P);
+}
+
+/* Load two DPFP values in reverse order. The address must be aligned. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_pd (double const *__P)
+{
+ __m128d __tmp = _mm_load_pd (__P);
+ return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
+}
+
+/* Store two DPFP values. The address must be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd (double *__P, __m128d __A)
+{
+ *(__m128d *)__P = __A;
+}
+
+/* Store two DPFP values. The address need not be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_pd (double *__P, __m128d __A)
+{
+ *(__m128d_u *)__P = __A;
+}
+
+/* Stores the lower DPFP value. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sd (double *__P, __m128d __A)
+{
+ *__P = ((__v2df)__A)[0];
+}
+
+extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_f64 (__m128d __A)
+{
+ return ((__v2df)__A)[0];
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pd (double *__P, __m128d __A)
+{
+ _mm_store_sd (__P, __A);
+}
+
+/* Stores the upper DPFP value. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pd (double *__P, __m128d __A)
+{
+ *__P = ((__v2df)__A)[1];
+}
+
+/* Store the lower DPFP value across two words.
+ The address must be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_pd (double *__P, __m128d __A)
+{
+ _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd1 (double *__P, __m128d __A)
+{
+ _mm_store1_pd (__P, __A);
+}
+
+/* Store two DPFP values in reverse order. The address must be aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_pd (double *__P, __m128d __A)
+{
+ _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
+}
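
The aligned load/store forms fault on a misaligned address, while the _u variants accept any address; a minimal round-trip sketch:

#include <emmintrin.h>

static void
copy_two_doubles (double *dst, const double *src)
{
  /* Unaligned forms: correct regardless of 16-byte alignment.  */
  _mm_storeu_pd (dst, _mm_loadu_pd (src));
}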
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si32 (__m128i __A)
+{
+ return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64 (__m128i __A)
+{
+ return ((__v2di)__A)[0];
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64x (__m128i __A)
+{
+ return ((__v2di)__A)[0];
+}
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d) ((__v2df)__A + (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d) ((__v2df)__A - (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d) ((__v2df)__A * (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d) ((__v2df)__A / (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_pd (__m128d __A)
+{
+ return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
+}
+
+/* Return pair {sqrt (B[0]), A[1]}. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_sd (__m128d __A, __m128d __B)
+{
+ __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+ return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+ (__v2df)
+ __builtin_ia32_cmpltsd ((__v2df) __B,
+ (__v2df)
+ __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+ (__v2df)
+ __builtin_ia32_cmplesd ((__v2df) __B,
+ (__v2df)
+ __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+ (__v2df)
+ __builtin_ia32_cmpnltsd ((__v2df) __B,
+ (__v2df)
+ __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+ (__v2df)
+ __builtin_ia32_cmpnlesd ((__v2df) __B,
+ (__v2df)
+ __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_sd (__m128d __A, __m128d __B)
+{
+ return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
+}
+
+/* Create a vector of Qi, where i is the element number. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64x (long long __q1, long long __q0)
+{
+ return __extension__ (__m128i)(__v2di){ __q0, __q1 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64 (__m64 __q1, __m64 __q0)
+{
+ return _mm_set_epi64x ((long long)__q1, (long long)__q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
+{
+ return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
+ short __q3, short __q2, short __q1, short __q0)
+{
+ return __extension__ (__m128i)(__v8hi){
+ __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00)
+{
+ return __extension__ (__m128i)(__v16qi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+ };
+}
+
+/* Set all of the elements of the vector to A. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64x (long long __A)
+{
+ return _mm_set_epi64x (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64 (__m64 __A)
+{
+ return _mm_set_epi64 (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi32 (int __A)
+{
+ return _mm_set_epi32 (__A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi16 (short __A)
+{
+ return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi8 (char __A)
+{
+ return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+/* Create a vector of Qi, where i is the element number.
+ The parameter order is reversed from the _mm_set_epi* functions. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi64 (__m64 __q0, __m64 __q1)
+{
+ return _mm_set_epi64 (__q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
+{
+ return _mm_set_epi32 (__q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
+ short __q4, short __q5, short __q6, short __q7)
+{
+ return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
+ char __q04, char __q05, char __q06, char __q07,
+ char __q08, char __q09, char __q10, char __q11,
+ char __q12, char __q13, char __q14, char __q15)
+{
+ return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
+ __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
+}
+
+/* Load 128 bits of integer data; the narrower loads below fill
+   element 0 and zero the rest.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_si128 (__m128i const *__P)
+{
+ return *__P;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si128 (__m128i_u const *__P)
+{
+ return *__P;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_epi64 (__m128i_u const *__P)
+{
+ return _mm_set_epi64 ((__m64)0LL, *(__m64_u *)__P);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si64 (void const *__P)
+{
+ return _mm_loadl_epi64 ((__m128i_u *)__P);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si32 (void const *__P)
+{
+ return _mm_set_epi32 (0, 0, 0, (*(__m32_u *)__P)[0]);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si16 (void const *__P)
+{
+ return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, (*(__m16_u *)__P)[0]);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_si128 (__m128i *__P, __m128i __B)
+{
+ *__P = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
+{
+ *__P = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
+{
+ *(__m64_u *)__P = (__m64) ((__v2di)__B)[0];
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si64 (void *__P, __m128i __B)
+{
+ _mm_storel_epi64 ((__m128i_u *)__P, __B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si32 (void *__P, __m128i __B)
+{
+ *(__m32_u *)__P = (__m32) ((__v4si)__B)[0];
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si16 (void *__P, __m128i __B)
+{
+ *(__m16_u *)__P = (__m16) ((__v8hi)__B)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi64_pi64 (__m128i __B)
+{
+ return (__m64) ((__v2di)__B)[0];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movpi64_epi64 (__m64 __A)
+{
+ return _mm_set_epi64 ((__m64)0LL, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_epi64 (__m128i __A)
+{
+ return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
+}
+
+/* Create an undefined vector. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_si128 (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m128i __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+/* Create a vector of zeros. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si128 (void)
+{
+ return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_pd (__m128i __A)
+{
+ return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_ps (__m128i __A)
+{
+ return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epi32 (__m128d __A)
+{
+ return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_pi32 (__m128d __A)
+{
+ return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ps (__m128d __A)
+{
+ return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi32 (__m128d __A)
+{
+ return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_pi32 (__m128d __A)
+{
+ return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_pd (__m64 __A)
+{
+ return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi32 (__m128 __A)
+{
+ return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi32 (__m128 __A)
+{
+ return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pd (__m128 __A)
+{
+ return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si32 (__m128d __A)
+{
+ return __builtin_ia32_cvtsd2si ((__v2df) __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64 (__m128d __A)
+{
+ return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64x (__m128d __A)
+{
+ return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+#endif
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si32 (__m128d __A)
+{
+ return __builtin_ia32_cvttsd2si ((__v2df) __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64 (__m128d __A)
+{
+ return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64x (__m128d __A)
+{
+ return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
+}
+#endif
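+
+/* Illustrative note (not part of the original header): the cvtsd
+   intrinsics above round according to MXCSR (round-to-nearest-even by
+   default), while the cvttsd forms truncate toward zero; for 2.5 both
+   give 2, but for -1.7 they give -2 and -1 respectively.  */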
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_ss (__m128 __A, __m128d __B)
+{
+ return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_sd (__m128d __A, int __B)
+{
+ return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_sd (__m128d __A, long long __B)
+{
+ return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+
+/* Microsoft intrinsic. */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_sd (__m128d __A, long long __B)
+{
+ return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_sd (__m128d __A, __m128 __B)
+{
+ return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
+{
+ return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask);
+}
+#else
+#define _mm_shuffle_pd(A, B, N) \
+ ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A), \
+ (__v2df)(__m128d)(B), (int)(N)))
+#endif
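+
+/* Illustrative note (not part of the original header): bit 0 of the
+   shuffle mask selects the low result element from __A and bit 1
+   selects the high result element from __B, so _mm_shuffle_pd (__A,
+   __B, 1) yields { __A[1], __B[0] }.  */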
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pd (__m128d __A, __m128d __B)
+{
+ return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pd (__m128d __A, double const *__B)
+{
+ return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pd (__m128d __A, double const *__B)
+{
+ return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pd (__m128d __A)
+{
+ return __builtin_ia32_movmskpd ((__v2df)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v16qu)__A + (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v8hu)__A + (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v4su)__A + (__v4su)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A + (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v16qu)__A - (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v8hu)__A - (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v4su)__A - (__v4su)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A - (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v8hu)__A * (__v8hu)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_su32 (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epu32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
+}
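+
+/* Illustrative note (not part of the original header): pmuludq uses
+   only the even-indexed 32-bit elements (0 and 2 in the 128-bit form,
+   element 0 in the 64-bit form) and produces full 64-bit products.  */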
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi16 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi32 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi64 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi16 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi32 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bsrli_si128 (__m128i __A, const int __N)
+{
+ return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bslli_si128 (__m128i __A, const int __N)
+{
+ return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si128 (__m128i __A, const int __N)
+{
+ return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si128 (__m128i __A, const int __N)
+{
+ return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
+}
+#else
+#define _mm_bsrli_si128(A, N) \
+ ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
+#define _mm_bslli_si128(A, N) \
+ ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
+#define _mm_srli_si128(A, N) \
+ ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
+#define _mm_slli_si128(A, N) \
+ ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
+#endif
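+
+/* Illustrative note (not part of the original header): these shift the
+   full 128-bit value by whole bytes; the underlying builtins count
+   bits, hence the "* 8".  For example, _mm_srli_si128 (__A, 4) drops
+   the four low bytes of __A and zero-fills the four high bytes.  */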
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi16 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi32 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi64 (__m128i __A, int __B)
+{
+ return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi64 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si128 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A & (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_si128 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si128 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A | (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si128 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v2du)__A ^ (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v16qi)__A == (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v8hi)__A == (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v4si)__A == (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v16qs)__A < (__v16qs)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v8hi)__A < (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v4si)__A < (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v16qs)__A > (__v16qs)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v8hi)__A > (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) ((__v4si)__A > (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi16 (__m128i const __A, int const __N)
+{
+ return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
+}
+#else
+#define _mm_extract_epi16(A, N) \
+ ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_insert_epi16(A, D, N) \
+ ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A), \
+ (int)(D), (int)(N)))
+#endif
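+
+/* Illustrative note (not part of the original header): with
+   __A = _mm_set_epi16 (7, 6, 5, 4, 3, 2, 1, 0),
+   _mm_extract_epi16 (__A, 3) returns 3 (the element is zero-extended)
+   and _mm_insert_epi16 (__A, 42, 3) replaces that element with 42.  */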
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_epi8 (__m128i __A)
+{
+ return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epu16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflehi_epi16 (__m128i __A, const int __mask)
+{
+ return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflelo_epi16 (__m128i __A, const int __mask)
+{
+ return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi32 (__m128i __A, const int __mask)
+{
+ return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
+}
+#else
+#define _mm_shufflehi_epi16(A, N) \
+ ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_shufflelo_epi16(A, N) \
+ ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_shuffle_epi32(A, N) \
+ ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N)))
+#endif
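+
+/* Illustrative note (not part of the original header): each shuffle
+   mask packs four 2-bit source indices, low result element first, so
+   _mm_shuffle_epi32 (__A, 0x1B), i.e. _MM_SHUFFLE (0, 1, 2, 3),
+   reverses the four 32-bit elements of __A.  */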
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
+{
+ __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu16 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_epu8 (__m128i __A, __m128i __B)
+{
+ return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
+}
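+
+/* Illustrative note (not part of the original header): psadbw sums the
+   absolute differences of the sixteen byte pairs into one 16-bit total
+   per 64-bit half (the upper words are zeroed).  Running it against
+   _mm_setzero_si128 () is a common way to add up all sixteen bytes.  */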
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si32 (int *__A, int __B)
+{
+ __builtin_ia32_movnti (__A, __B);
+}
+
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si64 (long long int *__A, long long int __B)
+{
+ __builtin_ia32_movnti64 (__A, __B);
+}
+#endif
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si128 (__m128i *__A, __m128i __B)
+{
+ __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pd (double *__A, __m128d __B)
+{
+ __builtin_ia32_movntpd (__A, (__v2df)__B);
+}
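+
+/* Illustrative note (not part of the original header): the stream
+   stores above are non-temporal; they bypass the caches and are weakly
+   ordered, so issue _mm_sfence () from <xmmintrin.h>, or the
+   _mm_mfence () below, before publishing the written data to another
+   thread.  */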
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clflush (void const *__A)
+{
+ __builtin_ia32_clflush (__A);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lfence (void)
+{
+ __builtin_ia32_lfence ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mfence (void)
+{
+ __builtin_ia32_mfence ();
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si128 (int __A)
+{
+ return _mm_set_epi32 (0, 0, 0, __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si128 (long long __A)
+{
+ return _mm_set_epi64x (0, __A);
+}
+
+/* Microsoft intrinsic. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si128 (long long __A)
+{
+ return _mm_set_epi64x (0, __A);
+}
+#endif
+
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values; they just change the type.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)
+{
+ return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)
+{
+ return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)
+{
+ return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)
+{
+ return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)
+{
+ return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)
+{
+ return (__m128d) __A;
+}
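+
+/* Illustrative sketch (not part of the original header): the casts are
+   pure reinterpretations and typically compile to no instruction, so a
+   round trip is bit-identical:
+
+     __m128d __d = _mm_set1_pd (1.0);
+     __m128i __bits = _mm_castpd_si128 (__d);
+     __m128d __back = _mm_castsi128_pd (__bits);   // same bits as __d
+*/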
+
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+#endif /* _EMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <enqcmdintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _ENQCMDINTRIN_H_INCLUDED
+#define _ENQCMDINTRIN_H_INCLUDED
+
+#ifndef __ENQCMD__
+#pragma GCC push_options
+#pragma GCC target ("enqcmd")
+#define __DISABLE_ENQCMD__
+#endif /* __ENQCMD__ */
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enqcmd (void * __P, const void * __Q)
+{
+ return __builtin_ia32_enqcmd (__P, __Q);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enqcmds (void * __P, const void * __Q)
+{
+ return __builtin_ia32_enqcmds (__P, __Q);
+}
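+
+/* Illustrative note (an assumption from the ISA definition, not stated
+   in this header): both intrinsics return the resulting zero flag, so
+   a zero return means the 64-byte command was accepted by the device
+   and a nonzero return means the submission should be retried.  */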
+
+#ifdef __DISABLE_ENQCMD__
+#undef __DISABLE_ENQCMD__
+#pragma GCC pop_options
+#endif /* __DISABLE_ENQCMD__ */
+
+#endif /* _ENQCMDINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <f16intrin.h> directly; include <x86intrin.h> or <immintrin.h> instead."
+#endif
+
+#ifndef _F16CINTRIN_H_INCLUDED
+#define _F16CINTRIN_H_INCLUDED
+
+#ifndef __F16C__
+#pragma GCC push_options
+#pragma GCC target("f16c")
+#define __DISABLE_F16C__
+#endif /* __F16C__ */
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cvtsh_ss (unsigned short __S)
+{
+ __v8hi __H = __extension__ (__v8hi){ (short) __S, 0, 0, 0, 0, 0, 0, 0 };
+ __v4sf __A = __builtin_ia32_vcvtph2ps (__H);
+ return __builtin_ia32_vec_ext_v4sf (__A, 0);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_ps (__m128i __A)
+{
+ return (__m128) __builtin_ia32_vcvtph2ps ((__v8hi) __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_ps (__m128i __A)
+{
+ return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cvtss_sh (float __F, const int __I)
+{
+ __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 };
+ __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);
+ return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_ph (__m128 __A, const int __I)
+{
+ return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_ph (__m256 __A, const int __I)
+{
+ return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I);
+}
+#else
+#define _cvtss_sh(__F, __I) \
+ (__extension__ \
+ ({ \
+ __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; \
+ __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); \
+ (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); \
+ }))
+
+#define _mm_cvtps_ph(A, I) \
+ ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) (A), (int) (I)))
+
+#define _mm256_cvtps_ph(A, I) \
+ ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) (A), (int) (I)))
+#endif /* __OPTIMIZE__ */
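+
+/* Illustrative sketch (not part of the original header), assuming the
+   _MM_FROUND_* rounding macros from <smmintrin.h> are visible:
+
+     unsigned short __h = _cvtss_sh (1.5f, _MM_FROUND_TO_NEAREST_INT
+                                            | _MM_FROUND_NO_EXC);
+     float __f = _cvtsh_ss (__h);   // 1.5f survives the round trip
+*/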
+
+#ifdef __DISABLE_F16C__
+#undef __DISABLE_F16C__
+#pragma GCC pop_options
+#endif /* __DISABLE_F16C__ */
+
+#endif /* _F16CINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _FMA4INTRIN_H_INCLUDED
+#define _FMA4INTRIN_H_INCLUDED
+
+/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */
+#include <ammintrin.h>
+
+#ifndef __FMA4__
+#pragma GCC push_options
+#pragma GCC target("fma4")
+#define __DISABLE_FMA4__
+#endif /* __FMA4__ */
+
+/* 128-bit floating-point multiply/add instructions.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
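+
+/* Illustrative note (not part of the original header): the maddsub
+   forms subtract __C in the even-indexed elements and add it in the
+   odd-indexed ones; msubadd is the opposite, which is why it is
+   expressed above as maddsub with __C negated.  */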
+
+/* 256-bit floating-point multiply/add instructions.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#ifdef __DISABLE_FMA4__
+#undef __DISABLE_FMA4__
+#pragma GCC pop_options
+#endif /* __DISABLE_FMA4__ */
+
+#endif /* _FMA4INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _FMAINTRIN_H_INCLUDED
+#define _FMAINTRIN_H_INCLUDED
+
+#ifndef __FMA__
+#pragma GCC push_options
+#pragma GCC target("fma")
+#define __DISABLE_FMA__
+#endif /* __FMA__ */
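+
+/* Illustrative note (not part of the original header): these fused
+   intrinsics round only once, so _mm_fmadd_pd (__A, __B, __C) can
+   differ in the last bit from
+   _mm_add_pd (_mm_mul_pd (__A, __B), __C).  */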
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B,
+ (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d) __builtin_ia32_vfmaddsd3 ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128) __builtin_ia32_vfmaddss3 ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B,
+ (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmsubsd3 ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmsubss3 ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B,
+ (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfnmaddsd3 ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfnmaddss3 ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B,
+ (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B,
+ (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfnmsubsd3 ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfnmsubss3 ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+ (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+ (__v4df)__B,
+ (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+ (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+ (__v8sf)__B,
+ (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+ return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+ -(__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+ return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+ (__v4df)__B,
+ -(__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+ return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+ -(__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+ return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+ (__v8sf)__B,
+ -(__v8sf)__C);
+}
+
+#ifdef __DISABLE_FMA__
+#undef __DISABLE_FMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_FMA__ */
+
+#endif /* _FMAINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <fxsrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _FXSRINTRIN_H_INCLUDED
+#define _FXSRINTRIN_H_INCLUDED
+
+#ifndef __FXSR__
+#pragma GCC push_options
+#pragma GCC target("fxsr")
+#define __DISABLE_FXSR__
+#endif /* __FXSR__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave (void *__P)
+{
+ __builtin_ia32_fxsave (__P);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor (void *__P)
+{
+ __builtin_ia32_fxrstor (__P);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave64 (void *__P)
+{
+ __builtin_ia32_fxsave64 (__P);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor64 (void *__P)
+{
+ __builtin_ia32_fxrstor64 (__P);
+}
+#endif
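+
+/* Illustrative sketch (not part of the original header): the save area
+   is a 512-byte buffer that must be 16-byte aligned, e.g.
+
+     static unsigned char __fxarea[512] __attribute__ ((aligned (16)));
+     _fxsave (__fxarea);
+     _fxrstor (__fxarea);
+*/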
+
+#ifdef __DISABLE_FXSR__
+#undef __DISABLE_FXSR__
+#pragma GCC pop_options
+#endif /* __DISABLE_FXSR__ */
+
+#endif /* _FXSRINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _GFNIINTRIN_H_INCLUDED
+#define _GFNIINTRIN_H_INCLUDED
+
+#if !defined(__GFNI__) || !defined(__SSE2__)
+#pragma GCC push_options
+#pragma GCC target("gfni,sse2")
+#define __DISABLE_GFNI__
+#endif /* __GFNI__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8mul_epi8 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+ (__v16qi) __B);
+}
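+
+/* Illustrative note (not part of the original header): the GF(2^8)
+   multiply works byte-wise in the field defined by the reduction
+   polynomial x^8 + x^4 + x^3 + x + 1 (0x11B), the same representation
+   AES uses.  */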
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8affineinv_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
+{
+ return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi ((__v16qi) __A,
+ (__v16qi) __B,
+ __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8affine_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
+{
+ return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi) __A,
+ (__v16qi) __B, __C);
+}
+#else
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, C) \
+ ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(C)))
+#define _mm_gf2p8affine_epi64_epi8(A, B, C) \
+ ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi)(__m128i)(A), \
+ (__v16qi)(__m128i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_GFNI__
+#undef __DISABLE_GFNI__
+#pragma GCC pop_options
+#endif /* __DISABLE_GFNI__ */
+
+#if !defined(__GFNI__) || !defined(__AVX__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx")
+#define __DISABLE_GFNIAVX__
+#endif /* __GFNIAVX__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8mul_epi8 (__m256i __A, __m256i __B)
+{
+ return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi ((__v32qi) __A,
+ (__v32qi) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8affineinv_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
+{
+ return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi ((__v32qi) __A,
+ (__v32qi) __B,
+ __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8affine_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
+{
+ return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi) __A,
+ (__v32qi) __B, __C);
+}
+#else
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C) \
+ ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
+ (__v32qi)(__m256i)(B), \
+ (int)(C)))
+#define _mm256_gf2p8affine_epi64_epi8(A, B, C) \
+ ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi)(__m256i)(A), \
+ (__v32qi)(__m256i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX__
+#undef __DISABLE_GFNIAVX__
+#pragma GCC pop_options
+#endif /* __GFNIAVX__ */
+
+#if !defined(__GFNI__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx512vl")
+#define __DISABLE_GFNIAVX512VL__
+#endif /* __GFNIAVX512VL__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8mul_epi8 (__m128i __A, __mmask16 __B, __m128i __C, __m128i __D)
+{
+ return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __C,
+ (__v16qi) __D,
+ (__v16qi)__A, __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8mul_epi8 (__mmask16 __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __B,
+ (__v16qi) __C, (__v16qi) _mm_setzero_si128 (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8affineinv_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
+ __m128i __D, const int __E)
+{
+ return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __C,
+ (__v16qi) __D,
+ __E,
+ (__v16qi)__A,
+ __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8affineinv_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
+ const int __D)
+{
+ return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __B,
+ (__v16qi) __C, __D,
+ (__v16qi) _mm_setzero_si128 (),
+ __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8affine_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
+ __m128i __D, const int __E)
+{
+ return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __C,
+ (__v16qi) __D, __E, (__v16qi)__A, __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8affine_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
+ const int __D)
+{
+ return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __B,
+ (__v16qi) __C, __D, (__v16qi) _mm_setzero_si128 (), __A);
+}
+#else
+#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \
+ (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), \
+ (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
+ ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \
+ (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), \
+ (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), \
+ (__mmask16)(A)))
+#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
+ ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(C),\
+ (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
+#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \
+ ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B),\
+ (__v16qi)(__m128i)(C), (int)(D), \
+ (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX512VL__
+#undef __DISABLE_GFNIAVX512VL__
+#pragma GCC pop_options
+#endif /* __GFNIAVX512VL__ */
+
+#if !defined(__GFNI__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx512vl,avx512bw")
+#define __DISABLE_GFNIAVX512VLBW__
+#endif /* __GFNIAVX512VLBW__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8mul_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
+ __m256i __D)
+{
+ return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __C,
+ (__v32qi) __D,
+ (__v32qi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8mul_epi8 (__mmask32 __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __B,
+ (__v32qi) __C, (__v32qi) _mm256_setzero_si256 (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8affineinv_epi64_epi8 (__m256i __A, __mmask32 __B,
+ __m256i __C, __m256i __D, const int __E)
+{
+ return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __C,
+ (__v32qi) __D,
+ __E,
+ (__v32qi)__A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8affineinv_epi64_epi8 (__mmask32 __A, __m256i __B,
+ __m256i __C, const int __D)
+{
+ return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __B,
+ (__v32qi) __C, __D,
+ (__v32qi) _mm256_setzero_si256 (), __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8affine_epi64_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
+ __m256i __D, const int __E)
+{
+ return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __C,
+ (__v32qi) __D,
+ __E,
+ (__v32qi)__A,
+ __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8affine_epi64_epi8 (__mmask32 __A, __m256i __B,
+ __m256i __C, const int __D)
+{
+ return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __B,
+ (__v32qi) __C, __D, (__v32qi)_mm256_setzero_si256 (), __A);
+}
+#else
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \
+ (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), \
+ (__v32qi)(__m256i)(A), (__mmask32)(B)))
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
+ ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \
+ (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), \
+ (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
+#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
+ ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(C),\
+ (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B)))
+#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \
+ ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B),\
+ (__v32qi)(__m256i)(C), (int)(D), \
+ (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX512VLBW__
+#undef __DISABLE_GFNIAVX512VLBW__
+#pragma GCC pop_options
+#endif /* __GFNIAVX512VLBW__ */
+
+#if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx512f,avx512bw")
+#define __DISABLE_GFNIAVX512FBW__
+#endif /* __GFNIAVX512FBW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8mul_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+ __m512i __D)
+{
+ return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __C,
+ (__v64qi) __D, (__v64qi)__A, __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8mul_epi8 (__mmask64 __A, __m512i __B, __m512i __C)
+{
+ return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __B,
+ (__v64qi) __C, (__v64qi) _mm512_setzero_si512 (), __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B)
+{
+ return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A,
+ (__v64qi) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8affineinv_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+ __m512i __D, const int __E)
+{
+ return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __C,
+ (__v64qi) __D,
+ __E,
+ (__v64qi)__A,
+ __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8affineinv_epi64_epi8 (__mmask64 __A, __m512i __B,
+ __m512i __C, const int __D)
+{
+ return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __B,
+ (__v64qi) __C, __D,
+ (__v64qi) _mm512_setzero_si512 (), __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
+{
+ return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A,
+ (__v64qi) __B, __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8affine_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+ __m512i __D, const int __E)
+{
+ return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __C,
+ (__v64qi) __D, __E, (__v64qi)__A, __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8affine_epi64_epi8 (__mmask64 __A, __m512i __B, __m512i __C,
+ const int __D)
+{
+ return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __B,
+ (__v64qi) __C, __D, (__v64qi) _mm512_setzero_si512 (), __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
+{
+ return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A,
+ (__v64qi) __B, __C);
+}
+#else
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \
+ (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), \
+ (__v64qi)(__m512i)(A), (__mmask64)(B)))
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
+ ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \
+ (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), \
+ (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C) \
+ ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ( \
+ (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
+#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
+ ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(C),\
+ (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B)))
+#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \
+ ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(B),\
+ (__v64qi)(__m512i)(C), (int)(D), \
+ (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
+#define _mm512_gf2p8affine_epi64_epi8(A, B, C) \
+ ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A), \
+ (__v64qi)(__m512i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX512FBW__
+#undef __DISABLE_GFNIAVX512FBW__
+#pragma GCC pop_options
+#endif /* __GFNIAVX512FBW__ */
+
+#endif /* _GFNIINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _X86GPRINTRIN_H_INCLUDED
+# error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _HRESETINTRIN_H_INCLUDED
+#define _HRESETINTRIN_H_INCLUDED
+
+#ifndef __HRESET__
+#pragma GCC push_options
+#pragma GCC target ("hreset")
+#define __DISABLE_HRESET__
+#endif /* __HRESET__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_hreset (unsigned int __EAX)
+{
+ __builtin_ia32_hreset (__EAX);
+}
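+
+/* Usage sketch (illustrative): __EAX is a bit mask selecting which
+   processor history components to reset; the valid bits are enumerated
+   by CPUID and must also be enabled by the OS in IA32_HRESET_ENABLE.
+   A hypothetical call requesting reset of the component in bit 0:
+
+     _hreset (1u << 0);
+*/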
+
+#ifdef __DISABLE_HRESET__
+#undef __DISABLE_HRESET__
+#pragma GCC pop_options
+#endif /* __DISABLE_HRESET__ */
+#endif /* _HRESETINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <ia32intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+/* 32bit bsf */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfd (int __X)
+{
+ return __builtin_ctz (__X);
+}
+
+/* 32bit bsr */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrd (int __X)
+{
+ return __builtin_ia32_bsrsi (__X);
+}
+
+/* 32bit bswap */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapd (int __X)
+{
+ return __builtin_bswap32 (__X);
+}
+
+#ifndef __iamcu__
+
+#ifndef __CRC32__
+#pragma GCC push_options
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
+
+/* 32bit accumulate CRC32 (polynomial 0x11EDC6F41) value. */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32b (unsigned int __C, unsigned char __V)
+{
+ return __builtin_ia32_crc32qi (__C, __V);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32w (unsigned int __C, unsigned short __V)
+{
+ return __builtin_ia32_crc32hi (__C, __V);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32d (unsigned int __C, unsigned int __V)
+{
+ return __builtin_ia32_crc32si (__C, __V);
+}
+
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
+#pragma GCC pop_options
+#endif /* __DISABLE_CRC32__ */
+
+#endif /* __iamcu__ */
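+
+/* Usage sketch (illustrative): accumulating CRC32C over a buffer one
+   byte at a time.  The ~0u seed and final inversion follow the common
+   CRC32C (Castagnoli) convention; other protocols may seed differently:
+
+     static unsigned int
+     crc32c_bytes (const unsigned char *p, size_t n)
+     {
+       unsigned int c = ~0u;
+       while (n--)
+         c = __crc32b (c, *p++);
+       return ~c;
+     }
+*/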
+
+/* 32bit popcnt */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntd (unsigned int __X)
+{
+ return __builtin_popcount (__X);
+}
+
+#ifndef __iamcu__
+
+/* rdpmc */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdpmc (int __S)
+{
+ return __builtin_ia32_rdpmc (__S);
+}
+
+#endif /* __iamcu__ */
+
+/* rdtsc */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdtsc (void)
+{
+ return __builtin_ia32_rdtsc ();
+}
+
+#ifndef __iamcu__
+
+/* rdtscp */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdtscp (unsigned int *__A)
+{
+ return __builtin_ia32_rdtscp (__A);
+}
+
+#endif /* __iamcu__ */
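+
+/* Usage sketch (illustrative): a rough cycle measurement.  RDTSC is
+   not a serializing instruction, so out-of-order execution can skew
+   short measurements; __rdtscp additionally reports the current
+   processor id through its pointer argument:
+
+     unsigned int aux;
+     unsigned long long t0 = __rdtsc ();
+     ... code under measurement ...
+     unsigned long long t1 = __rdtscp (&aux);
+     unsigned long long cycles = t1 - t0;
+*/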
+
+/* 8bit rol */
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolb (unsigned char __X, int __C)
+{
+ return __builtin_ia32_rolqi (__X, __C);
+}
+
+/* 16bit rol */
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolw (unsigned short __X, int __C)
+{
+ return __builtin_ia32_rolhi (__X, __C);
+}
+
+/* 32bit rol */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rold (unsigned int __X, int __C)
+{
+ __C &= 31;
+ return (__X << __C) | (__X >> (-__C & 31));
+}
+
+/* 8bit ror */
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorb (unsigned char __X, int __C)
+{
+ return __builtin_ia32_rorqi (__X, __C);
+}
+
+/* 16bit ror */
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorw (unsigned short __X, int __C)
+{
+ return __builtin_ia32_rorhi (__X, __C);
+}
+
+/* 32bit ror */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rord (unsigned int __X, int __C)
+{
+ __C &= 31;
+ return (__X >> __C) | (__X << (-__C & 31));
+}
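+
+/* Usage sketch: the rotates are exact inverses for any count, so
+   __rord (__rold (x, c), c) == x.  For example:
+
+     unsigned int a = __rold (0x80000000u, 1);   a == 1
+     unsigned int b = __rord (1u, 1);            b == 0x80000000u
+*/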
+
+/* Pause */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__pause (void)
+{
+ __builtin_ia32_pause ();
+}
+
+#ifdef __x86_64__
+/* 64bit bsf */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfq (long long __X)
+{
+ return __builtin_ctzll (__X);
+}
+
+/* 64bit bsr */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrq (long long __X)
+{
+ return __builtin_ia32_bsrdi (__X);
+}
+
+/* 64bit bswap */
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapq (long long __X)
+{
+ return __builtin_bswap64 (__X);
+}
+
+#ifndef __CRC32__
+#pragma GCC push_options
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
+
+/* 64bit accumulate CRC32 (polynomial 0x11EDC6F41) value. */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32q (unsigned long long __C, unsigned long long __V)
+{
+ return __builtin_ia32_crc32di (__C, __V);
+}
+
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
+#pragma GCC pop_options
+#endif /* __DISABLE_CRC32__ */
+
+/* 64bit popcnt */
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntq (unsigned long long __X)
+{
+ return __builtin_popcountll (__X);
+}
+
+/* 64bit rol */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolq (unsigned long long __X, int __C)
+{
+ __C &= 63;
+ return (__X << __C) | (__X >> (-__C & 63));
+}
+
+/* 64bit ror */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorq (unsigned long long __X, int __C)
+{
+ __C &= 63;
+ return (__X >> __C) | (__X << (-__C & 63));
+}
+
+/* Read flags register */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+ return __builtin_ia32_readeflags_u64 ();
+}
+
+/* Write flags register */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned long long __X)
+{
+ __builtin_ia32_writeeflags_u64 (__X);
+}
+
+#define _bswap64(a) __bswapq(a)
+#define _popcnt64(a) __popcntq(a)
+#else
+
+/* Read flags register */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+ return __builtin_ia32_readeflags_u32 ();
+}
+
+/* Write flags register */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned int __X)
+{
+ __builtin_ia32_writeeflags_u32 (__X);
+}
+
+#endif
+
+/* On LP64 systems, longs are 64-bit. Use the appropriate rotate
+   function. */
+#ifdef __LP64__
+#define _lrotl(a,b) __rolq((a), (b))
+#define _lrotr(a,b) __rorq((a), (b))
+#else
+#define _lrotl(a,b) __rold((a), (b))
+#define _lrotr(a,b) __rord((a), (b))
+#endif
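+
+/* Usage sketch: on an LP64 target _lrotl rotates across all 64 bits,
+   so _lrotl (1UL, 63) yields 0x8000000000000000UL, while on an ILP32
+   target it wraps at 32 bits and _lrotl (1UL, 31) yields 0x80000000UL. */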
+
+#define _bit_scan_forward(a) __bsfd(a)
+#define _bit_scan_reverse(a) __bsrd(a)
+#define _bswap(a) __bswapd(a)
+#define _popcnt32(a) __popcntd(a)
+#ifndef __iamcu__
+#define _rdpmc(a) __rdpmc(a)
+#define _rdtscp(a) __rdtscp(a)
+#endif /* __iamcu__ */
+#define _rdtsc() __rdtsc()
+#define _rotwl(a,b) __rolw((a), (b))
+#define _rotwr(a,b) __rorw((a), (b))
+#define _rotl(a,b) __rold((a), (b))
+#define _rotr(a,b) __rord((a), (b))
--- /dev/null
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#define _IMMINTRIN_H_INCLUDED
+
+#include <x86gprintrin.h>
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <smmintrin.h>
+
+#include <wmmintrin.h>
+
+#include <avxintrin.h>
+
+#include <avxvnniintrin.h>
+
+#include <avxifmaintrin.h>
+
+#include <avxvnniint8intrin.h>
+
+#include <avx2intrin.h>
+
+#include <avx512fintrin.h>
+
+#include <avx512erintrin.h>
+
+#include <avx512pfintrin.h>
+
+#include <avx512cdintrin.h>
+
+#include <avx512vlintrin.h>
+
+#include <avx512bwintrin.h>
+
+#include <avx512dqintrin.h>
+
+#include <avx512vlbwintrin.h>
+
+#include <avx512vldqintrin.h>
+
+#include <avx512ifmaintrin.h>
+
+#include <avx512ifmavlintrin.h>
+
+#include <avx512vbmiintrin.h>
+
+#include <avx512vbmivlintrin.h>
+
+#include <avx5124fmapsintrin.h>
+
+#include <avx5124vnniwintrin.h>
+
+#include <avx512vpopcntdqintrin.h>
+
+#include <avx512vbmi2intrin.h>
+
+#include <avx512vbmi2vlintrin.h>
+
+#include <avx512vnniintrin.h>
+
+#include <avx512vnnivlintrin.h>
+
+#include <avx512vpopcntdqvlintrin.h>
+
+#include <avx512bitalgintrin.h>
+
+#include <avx512vp2intersectintrin.h>
+
+#include <avx512vp2intersectvlintrin.h>
+
+#ifdef __SSE2__
+#include <avx512fp16intrin.h>
+
+#include <avx512fp16vlintrin.h>
+#endif
+
+#include <shaintrin.h>
+
+#include <fmaintrin.h>
+
+#include <f16cintrin.h>
+
+#include <rtmintrin.h>
+
+#include <gfniintrin.h>
+
+#include <vaesintrin.h>
+
+#include <vpclmulqdqintrin.h>
+
+#ifdef __SSE2__
+#include <avx512bf16vlintrin.h>
+
+#include <avx512bf16intrin.h>
+
+#include <avxneconvertintrin.h>
+#endif
+
+#include <amxtileintrin.h>
+
+#include <amxint8intrin.h>
+
+#include <amxbf16intrin.h>
+
+#include <amxcomplexintrin.h>
+
+#include <prfchwintrin.h>
+
+#include <keylockerintrin.h>
+
+#include <amxfp16intrin.h>
+
+#endif /* _IMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <keylockerintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _KEYLOCKERINTRIN_H_INCLUDED
+#define _KEYLOCKERINTRIN_H_INCLUDED
+
+#ifndef __KL__
+#pragma GCC push_options
+#pragma GCC target("kl")
+#define __DISABLE_KL__
+#endif /* __KL__ */
+
+
+extern __inline
+void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadiwkey (unsigned int __I, __m128i __A, __m128i __B, __m128i __C)
+{
+ __builtin_ia32_loadiwkey ((__v2di) __B, (__v2di) __C, (__v2di) __A, __I);
+}
+
+extern __inline
+unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_encodekey128_u32 (unsigned int __I, __m128i __A, void * __P)
+{
+ return __builtin_ia32_encodekey128_u32 (__I, (__v2di)__A, __P);
+}
+
+extern __inline
+unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_encodekey256_u32 (unsigned int __I, __m128i __A, __m128i __B, void * __P)
+{
+ return __builtin_ia32_encodekey256_u32 (__I, (__v2di)__A, (__v2di)__B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec128kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+ return __builtin_ia32_aesdec128kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec256kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+ return __builtin_ia32_aesdec256kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc128kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+ return __builtin_ia32_aesenc128kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc256kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+ return __builtin_ia32_aesenc256kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+#ifdef __DISABLE_KL__
+#undef __DISABLE_KL__
+#pragma GCC pop_options
+#endif /* __DISABLE_KL__ */
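+
+/* Usage sketch (illustrative): wrap an AES-128 key into an opaque
+   handle, then encrypt one block through the handle.  Assumptions, per
+   Intel's Key Locker documentation: the internal wrapping key has
+   already been loaded (normally once, by the OS, via _mm_loadiwkey),
+   a 128-bit key handle occupies 48 bytes, and a nonzero return from
+   _mm_aesenc128kl_u8 signals an invalid handle:
+
+     unsigned char handle[48];
+     __m128i key = _mm_set1_epi8 (0x2b);
+     __m128i block = _mm_setzero_si128 (), out;
+     unsigned int ctl = _mm_encodekey128_u32 (0, key, handle);
+     unsigned char fail = _mm_aesenc128kl_u8 (&out, block, handle);
+*/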
+
+#ifndef __WIDEKL__
+#pragma GCC push_options
+#pragma GCC target("widekl")
+#define __DISABLE_WIDEKL__
+#endif /* __WIDEKL__ */
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdecwide128kl_u8 (__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+ return __builtin_ia32_aesdecwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdecwide256kl_u8 (__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+ return __builtin_ia32_aesdecwide256kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesencwide128kl_u8 (__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+ return __builtin_ia32_aesencwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesencwide256kl_u8 (__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+ return __builtin_ia32_aesencwide256kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+#ifdef __DISABLE_WIDEKL__
+#undef __DISABLE_WIDEKL__
+#pragma GCC pop_options
+#endif /* __DISABLE_WIDEKL__ */
+#endif /* _KEYLOCKERINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <lwpintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _LWPINTRIN_H_INCLUDED
+#define _LWPINTRIN_H_INCLUDED
+
+#ifndef __LWP__
+#pragma GCC push_options
+#pragma GCC target("lwp")
+#define __DISABLE_LWP__
+#endif /* __LWP__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__llwpcb (void *__pcbAddress)
+{
+ __builtin_ia32_llwpcb (__pcbAddress);
+}
+
+extern __inline void * __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__slwpcb (void)
+{
+ return __builtin_ia32_slwpcb ();
+}
+
+#ifdef __OPTIMIZE__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpval32 (unsigned int __data2, unsigned int __data1, unsigned int __flags)
+{
+ __builtin_ia32_lwpval32 (__data2, __data1, __flags);
+}
+
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpval64 (unsigned long long __data2, unsigned int __data1,
+ unsigned int __flags)
+{
+ __builtin_ia32_lwpval64 (__data2, __data1, __flags);
+}
+#endif
+#else
+#define __lwpval32(D2, D1, F) \
+ (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), \
+ (unsigned int) (F)))
+#ifdef __x86_64__
+#define __lwpval64(D2, D1, F) \
+ (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), \
+ (unsigned int) (F)))
+#endif
+#endif
+
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpins32 (unsigned int __data2, unsigned int __data1, unsigned int __flags)
+{
+ return __builtin_ia32_lwpins32 (__data2, __data1, __flags);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpins64 (unsigned long long __data2, unsigned int __data1,
+ unsigned int __flags)
+{
+ return __builtin_ia32_lwpins64 (__data2, __data1, __flags);
+}
+#endif
+#else
+#define __lwpins32(D2, D1, F) \
+ (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), \
+ (unsigned int) (F)))
+#ifdef __x86_64__
+#define __lwpins64(D2, D1, F) \
+ (__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), \
+ (unsigned int) (F)))
+#endif
+#endif
+
+#ifdef __DISABLE_LWP__
+#undef __DISABLE_LWP__
+#pragma GCC pop_options
+#endif /* __DISABLE_LWP__ */
+
+#endif /* _LWPINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <lzcntintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+
+#ifndef _LZCNTINTRIN_H_INCLUDED
+#define _LZCNTINTRIN_H_INCLUDED
+
+#ifndef __LZCNT__
+#pragma GCC push_options
+#pragma GCC target("lzcnt")
+#define __DISABLE_LZCNT__
+#endif /* __LZCNT__ */
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt16 (unsigned short __X)
+{
+ return __builtin_ia32_lzcnt_u16 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt32 (unsigned int __X)
+{
+ return __builtin_ia32_lzcnt_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lzcnt_u32 (unsigned int __X)
+{
+ return __builtin_ia32_lzcnt_u32 (__X);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt64 (unsigned long long __X)
+{
+ return __builtin_ia32_lzcnt_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lzcnt_u64 (unsigned long long __X)
+{
+ return __builtin_ia32_lzcnt_u64 (__X);
+}
+#endif
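+
+/* Usage sketch: unlike BSR, LZCNT is well defined for a zero input and
+   returns the operand width, so there is no undefined case to guard:
+
+     unsigned int z = _lzcnt_u32 (0);   z == 32
+     unsigned int o = _lzcnt_u32 (1);   o == 31
+*/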
+
+#ifdef __DISABLE_LZCNT__
+#undef __DISABLE_LZCNT__
+#pragma GCC pop_options
+#endif /* __DISABLE_LZCNT__ */
+
+#endif /* _LZCNTINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the mm3dnow.h (of supposedly AMD origin) included with
+ MSVC 7.1. */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW__
+#pragma GCC push_options
+#ifdef __x86_64__
+#pragma GCC target("sse,3dnow")
+#else
+#pragma GCC target("3dnow")
+#endif
+#define __DISABLE_3dNOW__
+#endif /* __3dNOW__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_femms (void)
+{
+ __builtin_ia32_femms();
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgusb (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pf2id (__m64 __A)
+{
+ return (__m64)__builtin_ia32_pf2id ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfacc (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfadd (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpeq (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpge (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpgt (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmax (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmin (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmul (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcp (__m64 __A)
+{
+ return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcpit1 (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcpit2 (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrsqrt (__m64 __A)
+{
+ return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrsqit1 (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfsub (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfsubr (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pi2fd (__m64 __A)
+{
+ return (__m64)__builtin_ia32_pi2fd ((__v2si)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhrw (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetch (void *__P)
+{
+ __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_float (float __A)
+{
+ return __extension__ (__m64)(__v2sf){ __A, 0.0f };
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_float (__m64 __A)
+{
+ union { __v2sf v; float a[2]; } __tmp;
+ __tmp.v = (__v2sf)__A;
+ return __tmp.a[0];
+}
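+
+/* Usage sketch: _m_from_float places the value in the low element (the
+   high element is zeroed) and _m_to_float reads it back, so the pair
+   round-trips a scalar:
+
+     __m64 v = _m_from_float (1.5f);
+     float f = _m_to_float (v);   f == 1.5f
+*/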
+
+#ifdef __DISABLE_3dNOW__
+#undef __DISABLE_3dNOW__
+#pragma GCC pop_options
+#endif /* __DISABLE_3dNOW__ */
+
+#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW_A__
+#pragma GCC push_options
+#ifdef __x86_64__
+#pragma GCC target("sse,3dnowa")
+#else
+#pragma GCC target("3dnowa")
+#endif
+#define __DISABLE_3dNOW_A__
+#endif /* __3dNOW_A__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pf2iw (__m64 __A)
+{
+ return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfnacc (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfpnacc (__m64 __A, __m64 __B)
+{
+ return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pi2fw (__m64 __A)
+{
+ return (__m64)__builtin_ia32_pi2fw ((__v2si)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pswapd (__m64 __A)
+{
+ return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A);
+}
+
+#ifdef __DISABLE_3dNOW_A__
+#undef __DISABLE_3dNOW_A__
+#pragma GCC pop_options
+#endif /* __DISABLE_3dNOW_A__ */
+
+#endif /* _MM3DNOW_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+ may not be visible. */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t __size, size_t __alignment)
+{
+ void *__ptr;
+ if (__alignment == 1)
+ return malloc (__size);
+ if (__alignment == 2 || (sizeof (void *) == 8 && __alignment == 4))
+ __alignment = sizeof (void *);
+ if (posix_memalign (&__ptr, __alignment, __size) == 0)
+ return __ptr;
+ else
+ return NULL;
+}
+
+static __inline void
+_mm_free (void *__ptr)
+{
+ free (__ptr);
+}
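+
+/* Usage sketch: allocate storage suitably aligned for 16-byte vector
+   loads and release it with the matching deallocator (portably,
+   pointers from _mm_malloc should only be released with _mm_free):
+
+     float *buf = (float *) _mm_malloc (256 * sizeof (float), 16);
+     if (buf)
+       {
+         ... use buf with aligned loads/stores ...
+         _mm_free (buf);
+       }
+*/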
+
+#endif /* _MM_MALLOC_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2002-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 9.0. */
+
+#ifndef _MMINTRIN_H_INCLUDED
+#define _MMINTRIN_H_INCLUDED
+
+#if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
+#pragma GCC push_options
+#ifdef __MMX_WITH_SSE__
+#pragma GCC target("sse2")
+#elif defined __x86_64__
+#pragma GCC target("sse,mmx")
+#else
+#pragma GCC target("mmx")
+#endif
+#define __DISABLE_MMX__
+#endif /* __MMX__ */
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
+typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__));
+typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__));
+
+/* Unaligned version of the same type */
+typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1)));
+typedef int __m32_u __attribute__ ((__vector_size__ (4), \
+ __may_alias__, __aligned__ (1)));
+typedef short __m16_u __attribute__ ((__vector_size__ (2), \
+ __may_alias__, __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics. */
+typedef int __v2si __attribute__ ((__vector_size__ (8)));
+typedef short __v4hi __attribute__ ((__vector_size__ (8)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef long long __v1di __attribute__ ((__vector_size__ (8)));
+typedef float __v2sf __attribute__ ((__vector_size__ (8)));
+
+/* Empty the multimedia state. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+ __builtin_ia32_emms ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_empty (void)
+{
+ _mm_empty ();
+}
+
+/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si64 (int __i)
+{
+ return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_int (int __i)
+{
+ return _mm_cvtsi32_si64 (__i);
+}
+
+#ifdef __x86_64__
+/* Convert I to a __m64 object. */
+
+/* Intel intrinsic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_int64 (long long __i)
+{
+ return (__m64) __i;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_m64 (long long __i)
+{
+ return (__m64) __i;
+}
+
+/* Microsoft intrinsic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si64 (long long __i)
+{
+ return (__m64) __i;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi64x (long long __i)
+{
+ return (__m64) __i;
+}
+#endif
+
+/* Convert the lower 32 bits of the __m64 object into an integer. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si32 (__m64 __i)
+{
+ return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_int (__m64 __i)
+{
+ return _mm_cvtsi64_si32 (__i);
+}
+
+#ifdef __x86_64__
+/* Convert the __m64 object to a 64bit integer. */
+
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_int64 (__m64 __i)
+{
+ return (long long)__i;
+}
+
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtm64_si64 (__m64 __i)
+{
+ return (long long)__i;
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si64x (__m64 __i)
+{
+ return (long long)__i;
+}
+#endif
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+ the result, and the four 16-bit values from M2 into the upper four 8-bit
+ values of the result, all with signed saturation. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packsswb (__m64 __m1, __m64 __m2)
+{
+ return _mm_packs_pi16 (__m1, __m2);
+}
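+
+/* Usage sketch: signed saturation clamps out-of-range words to the
+   byte limits, e.g. packing { 300, -300, 5, 0 } with itself yields the
+   bytes { 127, -128, 5, 0, 127, -128, 5, 0 }:
+
+     __m64 w = _mm_setr_pi16 (300, -300, 5, 0);
+     __m64 b = _mm_packs_pi16 (w, w);
+*/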
+
+/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
+ the result, and the two 32-bit values from M2 into the upper two 16-bit
+ values of the result, all with signed saturation. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packssdw (__m64 __m1, __m64 __m2)
+{
+ return _mm_packs_pi32 (__m1, __m2);
+}
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+ the result, and the four 16-bit values from M2 into the upper four 8-bit
+ values of the result, all with unsigned saturation. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packuswb (__m64 __m1, __m64 __m2)
+{
+ return _mm_packs_pu16 (__m1, __m2);
+}
+
+/* Interleave the four 8-bit values from the high half of M1 with the four
+ 8-bit values from the high half of M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhbw (__m64 __m1, __m64 __m2)
+{
+ return _mm_unpackhi_pi8 (__m1, __m2);
+}
+
+/* Interleave the two 16-bit values from the high half of M1 with the two
+ 16-bit values from the high half of M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhwd (__m64 __m1, __m64 __m2)
+{
+ return _mm_unpackhi_pi16 (__m1, __m2);
+}
+
+/* Interleave the 32-bit value from the high half of M1 with the 32-bit
+ value from the high half of M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhdq (__m64 __m1, __m64 __m2)
+{
+ return _mm_unpackhi_pi32 (__m1, __m2);
+}
+
+/* Interleave the four 8-bit values from the low half of M1 with the four
+ 8-bit values from the low half of M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpcklbw (__m64 __m1, __m64 __m2)
+{
+ return _mm_unpacklo_pi8 (__m1, __m2);
+}
+
+/* Interleave the two 16-bit values from the low half of M1 with the two
+ 16-bit values from the low half of M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpcklwd (__m64 __m1, __m64 __m2)
+{
+ return _mm_unpacklo_pi16 (__m1, __m2);
+}
+
+/* Interleave the 32-bit value from the low half of M1 with the 32-bit
+ value from the low half of M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckldq (__m64 __m1, __m64 __m2)
+{
+ return _mm_unpacklo_pi32 (__m1, __m2);
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddb (__m64 __m1, __m64 __m2)
+{
+ return _mm_add_pi8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddw (__m64 __m1, __m64 __m2)
+{
+ return _mm_add_pi16 (__m1, __m2);
+}
+
+/* Add the 32-bit values in M1 to the 32-bit values in M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddd (__m64 __m1, __m64 __m2)
+{
+ return _mm_add_pi32 (__m1, __m2);
+}
+
+/* Add the 64-bit values in M1 to the 64-bit values in M2. */
+#ifndef __SSE2__
+#pragma GCC push_options
+#ifdef __MMX_WITH_SSE__
+#pragma GCC target("sse2")
+#else
+#pragma GCC target("sse2,mmx")
+#endif
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_si64 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
+}
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
+ saturated arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddsb (__m64 __m1, __m64 __m2)
+{
+ return _mm_adds_pi8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
+ saturated arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddsw (__m64 __m1, __m64 __m2)
+{
+ return _mm_adds_pi16 (__m1, __m2);
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
+ saturated arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddusb (__m64 __m1, __m64 __m2)
+{
+ return _mm_adds_pu8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
+ saturated arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddusw (__m64 __m1, __m64 __m2)
+{
+ return _mm_adds_pu16 (__m1, __m2);
+}
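+
+/* Usage sketch: unsigned saturation clamps at the type maximum instead
+   of wrapping, e.g. adding 1 to a byte lane holding 0xFF leaves it at
+   0xFF:
+
+     __m64 a = _mm_set1_pi8 ((char) 0xFF);
+     __m64 b = _mm_set1_pi8 (1);
+     __m64 r = _mm_adds_pu8 (a, b);   every byte is 0xFF
+*/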
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubb (__m64 __m1, __m64 __m2)
+{
+ return _mm_sub_pi8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubw (__m64 __m1, __m64 __m2)
+{
+ return _mm_sub_pi16 (__m1, __m2);
+}
+
+/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi32 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubd (__m64 __m1, __m64 __m2)
+{
+ return _mm_sub_pi32 (__m1, __m2);
+}
+
+/* Subtract the 64-bit values in M2 from the 64-bit values in M1. */
+#ifndef __SSE2__
+#pragma GCC push_options
+#ifdef __MMX_WITH_SSE__
+#pragma GCC target("sse2")
+#else
+#pragma GCC target("sse2,mmx")
+#endif
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_si64 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
+}
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
+ saturating arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubsb (__m64 __m1, __m64 __m2)
+{
+ return _mm_subs_pi8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+ signed saturating arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubsw (__m64 __m1, __m64 __m2)
+{
+ return _mm_subs_pi16 (__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
+ unsigned saturating arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pu8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubusb (__m64 __m1, __m64 __m2)
+{
+ return _mm_subs_pu8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+ unsigned saturating arithmetic. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pu16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubusw (__m64 __m1, __m64 __m2)
+{
+ return _mm_subs_pu16 (__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
+ four 32-bit intermediate results, which are then summed by pairs to
+ produce two 32-bit results. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaddwd (__m64 __m1, __m64 __m2)
+{
+ return _mm_madd_pi16 (__m1, __m2);
+}
+
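+/* Worked example (illustrative): since the last argument of
+   _mm_set_pi16 is the least-significant lane, the call
+
+     _mm_madd_pi16 (_mm_set_pi16 (1, 2, 3, 4),
+                    _mm_set_pi16 (10, 20, 30, 40))
+
+   multiplies lanes (4, 3, 2, 1) by (40, 30, 20, 10) and sums by
+   pairs: the low 32-bit result is 4*40 + 3*30 = 250, the high one
+   2*20 + 1*10 = 50.  */
+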
+/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
+ M2 and produce the high 16 bits of the 32-bit results. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhw (__m64 __m1, __m64 __m2)
+{
+ return _mm_mulhi_pi16 (__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
+ the low 16 bits of the results. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmullw (__m64 __m1, __m64 __m2)
+{
+ return _mm_mullo_pi16 (__m1, __m2);
+}
+
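+/* Note (illustrative): _mm_mulhi_pi16 and _mm_mullo_pi16 together
+   recover the full 32-bit product of each lane pair.  For instance
+   300 * 400 = 120000 = 0x1D4C0, so mullo yields 0xD4C0 and mulhi
+   yields 0x0001 for that lane.  */
+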
+/* Shift four 16-bit values in M left by COUNT. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_pi16 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllw (__m64 __m, __m64 __count)
+{
+ return _mm_sll_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllwi (__m64 __m, int __count)
+{
+ return _mm_slli_pi16 (__m, __count);
+}
+
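+/* Note (illustrative): the immediate forms (_mm_slli_pi16) take the
+   count as an int, while the register forms (_mm_sll_pi16) read it
+   from the low bits of an __m64.  Counts of 16 or more clear every
+   lane; e.g. _mm_slli_pi16 (_mm_set1_pi16 (0x0101), 4) gives 0x1010
+   in each lane.  */
+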
+/* Shift two 32-bit values in M left by COUNT. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_pi32 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pslld (__m64 __m, __m64 __count)
+{
+ return _mm_sll_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi32 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pslldi (__m64 __m, int __count)
+{
+ return _mm_slli_pi32 (__m, __count);
+}
+
+/* Shift the 64-bit value in M left by COUNT. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_si64 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllq (__m64 __m, __m64 __count)
+{
+ return _mm_sll_si64 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllqi (__m64 __m, int __count)
+{
+ return _mm_slli_si64 (__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_pi16 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psraw (__m64 __m, __m64 __count)
+{
+ return _mm_sra_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_pi16 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrawi (__m64 __m, int __count)
+{
+ return _mm_srai_pi16 (__m, __count);
+}
+
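+/* Illustrative contrast with the logical shifts below: on lanes
+   holding -16 (0xFFF0), _mm_srai_pi16 (__m, 2) gives -4 (0xFFFC,
+   sign-filled), whereas _mm_srli_pi16 (__m, 2) gives 0x3FFC
+   (zero-filled).  */
+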
+/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_pi32 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrad (__m64 __m, __m64 __count)
+{
+ return _mm_sra_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_pi32 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psradi (__m64 __m, int __count)
+{
+ return _mm_srai_pi32 (__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_pi16 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlw (__m64 __m, __m64 __count)
+{
+ return _mm_srl_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlwi (__m64 __m, int __count)
+{
+ return _mm_srli_pi16 (__m, __count);
+}
+
+/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_pi32 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrld (__m64 __m, __m64 __count)
+{
+ return _mm_srl_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrldi (__m64 __m, int __count)
+{
+ return _mm_srli_pi32 (__m, __count);
+}
+
+/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_si64 (__m64 __m, __m64 __count)
+{
+ return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlq (__m64 __m, __m64 __count)
+{
+ return _mm_srl_si64 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int __count)
+{
+ return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlqi (__m64 __m, int __count)
+{
+ return _mm_srli_si64 (__m, __count);
+}
+
+/* Bit-wise AND the 64-bit values in M1 and M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_pand (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pand (__m64 __m1, __m64 __m2)
+{
+ return _mm_and_si64 (__m1, __m2);
+}
+
+/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
+ 64-bit value in M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_si64 (__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_pandn (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pandn (__m64 __m1, __m64 __m2)
+{
+ return _mm_andnot_si64 (__m1, __m2);
+}
+
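+/* Note (editorial): the first operand is the one complemented, so
+   _mm_andnot_si64 (__mask, __val) computes (~__mask) & __val -- a
+   common source of swapped-argument bugs.  */
+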
+/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_por (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_por (__m64 __m1, __m64 __m2)
+{
+ return _mm_or_si64 (__m1, __m2);
+}
+
+/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+ return __builtin_ia32_pxor (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pxor (__m64 __m1, __m64 __m2)
+{
+ return _mm_xor_si64 (__m1, __m2);
+}
+
+/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
+ test is true and zero if false. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqb (__m64 __m1, __m64 __m2)
+{
+ return _mm_cmpeq_pi8 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtb (__m64 __m1, __m64 __m2)
+{
+ return _mm_cmpgt_pi8 (__m1, __m2);
+}
+
+/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
+ the test is true and zero if false. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqw (__m64 __m1, __m64 __m2)
+{
+ return _mm_cmpeq_pi16 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtw (__m64 __m1, __m64 __m2)
+{
+ return _mm_cmpgt_pi16 (__m1, __m2);
+}
+
+/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
+ the test is true and zero if false. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqd (__m64 __m1, __m64 __m2)
+{
+ return _mm_cmpeq_pi32 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
+{
+ return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtd (__m64 __m1, __m64 __m2)
+{
+ return _mm_cmpgt_pi32 (__m1, __m2);
+}
+
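+/* Illustrative sketch: the all-ones/all-zeros masks produced by the
+   comparisons combine with the logical operations above into a
+   branchless select, e.g. a lane-wise signed 16-bit maximum:
+
+     __m64 __m = _mm_cmpgt_pi16 (__a, __b);
+     __m64 __r = _mm_or_si64 (_mm_and_si64 (__m, __a),
+                              _mm_andnot_si64 (__m, __b));
+*/
+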
+/* Creates a 64-bit zero. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+ return (__m64)0LL;
+}
+
+/* Creates a vector of two 32-bit values; I0 is least significant. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (int __i1, int __i0)
+{
+ return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
+}
+
+/* Creates a vector of four 16-bit values; W0 is least significant. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
+{
+ return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
+}
+
+/* Creates a vector of eight 8-bit values; B0 is least significant. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
+ char __b3, char __b2, char __b1, char __b0)
+{
+ return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
+ __b4, __b5, __b6, __b7);
+}
+
+/* Similar, but with the arguments in reverse order. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi32 (int __i0, int __i1)
+{
+ return _mm_set_pi32 (__i1, __i0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
+{
+ return _mm_set_pi16 (__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
+ char __b4, char __b5, char __b6, char __b7)
+{
+ return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
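+/* Illustrative equivalence: _mm_set_pi16 (3, 2, 1, 0) and
+   _mm_setr_pi16 (0, 1, 2, 3) build the same vector, with 0 in the
+   least-significant lane.  */
+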
+/* Creates a vector of two 32-bit values, both elements containing I. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi32 (int __i)
+{
+ return _mm_set_pi32 (__i, __i);
+}
+
+/* Creates a vector of four 16-bit values, all elements containing W. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi16 (short __w)
+{
+ return _mm_set_pi16 (__w, __w, __w, __w);
+}
+
+/* Creates a vector of eight 8-bit values, all elements containing B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi8 (char __b)
+{
+ return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
+}
+#ifdef __DISABLE_MMX__
+#undef __DISABLE_MMX__
+#pragma GCC pop_options
+#endif /* __DISABLE_MMX__ */
+
+#endif /* _MMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <movdirintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _MOVDIRINTRIN_H_INCLUDED
+#define _MOVDIRINTRIN_H_INCLUDED
+
+#ifndef __MOVDIRI__
+#pragma GCC push_options
+#pragma GCC target ("movdiri")
+#define __DISABLE_MOVDIRI__
+#endif /* __MOVDIRI__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_directstoreu_u32 (void * __P, unsigned int __A)
+{
+ __builtin_ia32_directstoreu_u32 ((unsigned int *)__P, __A);
+}
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_directstoreu_u64 (void * __P, unsigned long long __A)
+{
+ __builtin_ia32_directstoreu_u64 ((unsigned long long *)__P, __A);
+}
+#endif
+
+#ifdef __DISABLE_MOVDIRI__
+#undef __DISABLE_MOVDIRI__
+#pragma GCC pop_options
+#endif /* __DISABLE_MOVDIRI__ */
+
+#ifndef __MOVDIR64B__
+#pragma GCC push_options
+#pragma GCC target ("movdir64b")
+#define __DISABLE_MOVDIR64B__
+#endif /* __MOVDIR64B__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_movdir64b (void * __P, const void * __Q)
+{
+ __builtin_ia32_movdir64b (__P, __Q);
+}
+
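+/* Usage sketch (illustrative): MOVDIR64B performs a single 64-byte
+   direct store from the source __Q to the destination __P, which
+   the ISA requires to be 64-byte aligned:
+
+     char __attribute__ ((aligned (64))) __dst[64];
+     char __src[64] = { 0 };
+     _movdir64b (__dst, __src);
+*/
+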
+#ifdef __DISABLE_MOVDIR64B__
+#undef __DISABLE_MOVDIR64B__
+#pragma GCC pop_options
+#endif /* __DISABLE_MOVDIR64B__ */
+#endif /* _MOVDIRINTRIN_H_INCLUDED. */
--- /dev/null
+/* Copyright (C) 2021-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _MWAITINTRIN_H_INCLUDED
+#define _MWAITINTRIN_H_INCLUDED
+
+#ifndef __MWAIT__
+#pragma GCC push_options
+#pragma GCC target("mwait")
+#define __DISABLE_MWAIT__
+#endif /* __MWAIT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_monitor (void const * __P, unsigned int __E, unsigned int __H)
+{
+ __builtin_ia32_monitor (__P, __E, __H);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mwait (unsigned int __E, unsigned int __H)
+{
+ __builtin_ia32_mwait (__E, __H);
+}
+
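+/* Typical pairing (illustrative, __flag is a hypothetical variable):
+   arm the monitor on an address, then wait until it is written or
+   another wake event occurs:
+
+     _mm_monitor (&__flag, 0, 0);
+     if (!__flag)
+       _mm_mwait (0, 0);
+*/
+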
+#ifdef __DISABLE_MWAIT__
+#undef __DISABLE_MWAIT__
+#pragma GCC pop_options
+#endif /* __DISABLE_MWAIT__ */
+
+#endif /* _MWAITINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _MWAITXINTRIN_H_INCLUDED
+#define _MWAITXINTRIN_H_INCLUDED
+
+#ifndef __MWAITX__
+#pragma GCC push_options
+#pragma GCC target("mwaitx")
+#define __DISABLE_MWAITX__
+#endif /* __MWAITX__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_monitorx (void const * __P, unsigned int __E, unsigned int __H)
+{
+ __builtin_ia32_monitorx (__P, __E, __H);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mwaitx (unsigned int __E, unsigned int __H, unsigned int __C)
+{
+ __builtin_ia32_mwaitx (__E, __H, __C);
+}
+
+#ifdef __DISABLE_MWAITX__
+#undef __DISABLE_MWAITX__
+#pragma GCC pop_options
+#endif /* __DISABLE_MWAITX__ */
+
+#endif /* _MWAITXINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <pconfigintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _PCONFIGINTRIN_H_INCLUDED
+#define _PCONFIGINTRIN_H_INCLUDED
+
+#ifndef __PCONFIG__
+#pragma GCC push_options
+#pragma GCC target("pconfig")
+#define __DISABLE_PCONFIG__
+#endif /* __PCONFIG__ */
+
+#define __pconfig_b(leaf, b, retval) \
+ __asm__ __volatile__ ("pconfig\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "b" (b) \
+ : "cc")
+
+#define __pconfig_generic(leaf, b, c, d, retval) \
+ __asm__ __volatile__ ("pconfig\n\t" \
+ : "=a" (retval), "=b" (b), "=c" (c), "=d" (d) \
+ : "a" (leaf), "b" (b), "c" (c), "d" (d) \
+ : "cc")
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pconfig_u32 (const unsigned int __L, size_t __D[])
+{
+ enum __pconfig_type
+ {
+ __PCONFIG_KEY_PROGRAM = 0x01,
+ };
+
+ unsigned int __R = 0;
+
+ if (!__builtin_constant_p (__L))
+ __pconfig_generic (__L, __D[0], __D[1], __D[2], __R);
+ else switch (__L)
+ {
+ case __PCONFIG_KEY_PROGRAM:
+ __pconfig_b (__L, __D[0], __R);
+ break;
+ default:
+ __pconfig_generic (__L, __D[0], __D[1], __D[2], __R);
+ }
+ return __R;
+}
+
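+/* Note (editorial): per the asm constraints above, the leaf __L is
+   passed in EAX and __D[0], __D[1], __D[2] in RBX, RCX and RDX; the
+   PCONFIG status is read back from EAX.  */
+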
+#ifdef __DISABLE_PCONFIG__
+#undef __DISABLE_PCONFIG__
+#pragma GCC pop_options
+#endif /* __DISABLE_PCONFIG__ */
+
+#endif /* _PCONFIGINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <pkuintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _PKUINTRIN_H_INCLUDED
+#define _PKUINTRIN_H_INCLUDED
+
+#ifndef __PKU__
+#pragma GCC push_options
+#pragma GCC target("pku")
+#define __DISABLE_PKU__
+#endif /* __PKU__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdpkru_u32 (void)
+{
+ return __builtin_ia32_rdpkru ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrpkru (unsigned int __key)
+{
+ __builtin_ia32_wrpkru (__key);
+}
+
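+/* Usage sketch (illustrative): PKRU holds two bits per protection
+   key -- bit 2*k disables access and bit 2*k+1 disables writes for
+   key k.  To additionally deny writes for key 1:
+
+     _wrpkru (_rdpkru_u32 () | (1U << 3));
+*/
+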
+#ifdef __DISABLE_PKU__
+#undef __DISABLE_PKU__
+#pragma GCC pop_options
+#endif /* __DISABLE_PKU__ */
+
+#endif /* _PKUINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2003-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 9.0. */
+
+#ifndef _PMMINTRIN_H_INCLUDED
+#define _PMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE2 and SSE header files.  */
+#include <emmintrin.h>
+#include <mwaitintrin.h>
+
+#ifndef __SSE3__
+#pragma GCC push_options
+#pragma GCC target("sse3")
+#define __DISABLE_SSE3__
+#endif /* __SSE3__ */
+
+/* Additional bits in the MXCSR. */
+#define _MM_DENORMALS_ZERO_MASK 0x0040
+#define _MM_DENORMALS_ZERO_ON 0x0040
+#define _MM_DENORMALS_ZERO_OFF 0x0000
+
+#define _MM_SET_DENORMALS_ZERO_MODE(mode) \
+ _mm_setcsr ((_mm_getcsr () & ~_MM_DENORMALS_ZERO_MASK) | (mode))
+#define _MM_GET_DENORMALS_ZERO_MODE() \
+ (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+
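+/* Usage sketch (illustrative): treat denormal inputs as zero for
+   subsequent SSE computation in this thread:
+
+     _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON);
+*/
+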
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_ps (__m128 __X, __m128 __Y)
+{
+ return (__m128) __builtin_ia32_addsubps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_ps (__m128 __X, __m128 __Y)
+{
+ return (__m128) __builtin_ia32_haddps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_ps (__m128 __X, __m128 __Y)
+{
+ return (__m128) __builtin_ia32_hsubps ((__v4sf)__X, (__v4sf)__Y);
+}
+
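+/* Illustrative semantics: _mm_hadd_ps (X, Y) returns
+   (X[0]+X[1], X[2]+X[3], Y[0]+Y[1], Y[2]+Y[3]), and _mm_hsub_ps
+   subtracts each odd element from the preceding even one instead.  */
+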
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehdup_ps (__m128 __X)
+{
+ return (__m128) __builtin_ia32_movshdup ((__v4sf)__X);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_moveldup_ps (__m128 __X)
+{
+ return (__m128) __builtin_ia32_movsldup ((__v4sf)__X);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_pd (__m128d __X, __m128d __Y)
+{
+ return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pd (__m128d __X, __m128d __Y)
+{
+ return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pd (__m128d __X, __m128d __Y)
+{
+ return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loaddup_pd (double const *__P)
+{
+ return _mm_load1_pd (__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movedup_pd (__m128d __X)
+{
+ return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lddqu_si128 (__m128i const *__P)
+{
+ return (__m128i) __builtin_ia32_lddqu ((char const *)__P);
+}
+
+#ifdef __DISABLE_SSE3__
+#undef __DISABLE_SSE3__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE3__ */
+
+#endif /* _PMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _POPCNTINTRIN_H_INCLUDED
+#define _POPCNTINTRIN_H_INCLUDED
+
+#ifndef __POPCNT__
+#pragma GCC push_options
+#pragma GCC target("popcnt")
+#define __DISABLE_POPCNT__
+#endif /* __POPCNT__ */
+
+/* Calculate the number of bits set to 1.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_u32 (unsigned int __X)
+{
+ return __builtin_popcount (__X);
+}
+
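+/* Example (illustrative): _mm_popcnt_u32 (0xF0F0) returns 8.  */
+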
+#ifdef __x86_64__
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_u64 (unsigned long long __X)
+{
+ return __builtin_popcountll (__X);
+}
+#endif
+
+#ifdef __DISABLE_POPCNT__
+#undef __DISABLE_POPCNT__
+#pragma GCC pop_options
+#endif /* __DISABLE_POPCNT__ */
+
+#endif /* _POPCNTINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2022-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _X86GPRINTRIN_H_INCLUDED
+# error "Never use <prfchiintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _PRFCHIINTRIN_H_INCLUDED
+#define _PRFCHIINTRIN_H_INCLUDED
+
+#ifdef __x86_64__
+
+#ifndef __PREFETCHI__
+#pragma GCC push_options
+#pragma GCC target("prefetchi")
+#define __DISABLE_PREFETCHI__
+#endif /* __PREFETCHI__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchit0 (void* __P)
+{
+ __builtin_ia32_prefetchi (__P, 3);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchit1 (void* __P)
+{
+ __builtin_ia32_prefetchi (__P, 2);
+}
+
+#ifdef __DISABLE_PREFETCHI__
+#undef __DISABLE_PREFETCHI__
+#pragma GCC pop_options
+#endif /* __DISABLE_PREFETCHI__ */
+
+#endif /* __x86_64__ */
+
+#endif /* _PRFCHIINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if !defined _IMMINTRIN_H_INCLUDED && !defined _MM3DNOW_H_INCLUDED
+# error "Never use <prfchwintrin.h> directly; include <immintrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef _PRFCHWINTRIN_H_INCLUDED
+#define _PRFCHWINTRIN_H_INCLUDED
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchw (void *__P)
+{
+ __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+
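+/* Note (editorial): the second and third arguments of
+   __builtin_prefetch request write intent (1) and maximum temporal
+   locality (3), matching the T0 hint named in the comment.  */
+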
+#endif /* _PRFCHWINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
+#endif /* _X86GPRINTRIN_H_INCLUDED */
+
+#ifndef __RAOINTINTRIN_H_INCLUDED
+#define __RAOINTINTRIN_H_INCLUDED
+
+#ifndef __RAOINT__
+#pragma GCC push_options
+#pragma GCC target("raoint")
+#define __DISABLE_RAOINT__
+#endif /* __RAOINT__ */
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aadd_i32 (int *__A, int __B)
+{
+ __builtin_ia32_aadd32 ((int *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aand_i32 (int *__A, int __B)
+{
+ __builtin_ia32_aand32 ((int *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aor_i32 (int *__A, int __B)
+{
+ __builtin_ia32_aor32 ((int *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_axor_i32 (int *__A, int __B)
+{
+ __builtin_ia32_axor32 ((int *)__A, __B);
+}
+
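+/* Note (illustrative): unlike the __atomic builtins, these remote
+   atomics return nothing -- e.g. _aadd_i32 (&__counter, 1)
+   atomically increments __counter without fetching the old value.  */
+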
+#ifdef __x86_64__
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aadd_i64 (long long *__A, long long __B)
+{
+ __builtin_ia32_aadd64 ((long long *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aand_i64 (long long *__A, long long __B)
+{
+ __builtin_ia32_aand64 ((long long *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aor_i64 (long long *__A, long long __B)
+{
+ __builtin_ia32_aor64 ((long long *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_axor_i64 (long long *__A, long long __B)
+{
+ __builtin_ia32_axor64 ((long long *)__A, __B);
+}
+#endif /* __x86_64__ */
+
+#ifdef __DISABLE_RAOINT__
+#undef __DISABLE_RAOINT__
+#pragma GCC pop_options
+#endif /* __DISABLE_RAOINT__ */
+
+#endif /* __RAOINTINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <rdseedintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _RDSEEDINTRIN_H_INCLUDED
+#define _RDSEEDINTRIN_H_INCLUDED
+
+#ifndef __RDSEED__
+#pragma GCC push_options
+#pragma GCC target("rdseed")
+#define __DISABLE_RDSEED__
+#endif /* __RDSEED__ */
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed16_step (unsigned short *__p)
+{
+ return __builtin_ia32_rdseed_hi_step (__p);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed32_step (unsigned int *__p)
+{
+ return __builtin_ia32_rdseed_si_step (__p);
+}
+
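+/* Usage sketch (illustrative): each *_step function returns 1 on
+   success and 0 when no seed was available, so callers typically
+   retry:
+
+     unsigned int __r;
+     while (!_rdseed32_step (&__r))
+       ;
+*/
+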
+#ifdef __x86_64__
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed64_step (unsigned long long *__p)
+{
+ return __builtin_ia32_rdseed_di_step (__p);
+}
+#endif
+
+#ifdef __DISABLE_RDSEED__
+#undef __DISABLE_RDSEED__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDSEED__ */
+
+#endif /* _RDSEEDINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <rtmintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _RTMINTRIN_H_INCLUDED
+#define _RTMINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+#define _XBEGIN_STARTED (~0u)
+#define _XABORT_EXPLICIT (1 << 0)
+#define _XABORT_RETRY (1 << 1)
+#define _XABORT_CONFLICT (1 << 2)
+#define _XABORT_CAPACITY (1 << 3)
+#define _XABORT_DEBUG (1 << 4)
+#define _XABORT_NESTED (1 << 5)
+#define _XABORT_CODE(x) (((x) >> 24) & 0xFF)
+
+/* Start an RTM code region. Return _XBEGIN_STARTED on success and the
+ abort condition otherwise. */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xbegin (void)
+{
+ return __builtin_ia32_xbegin ();
+}
+
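+/* Typical usage sketch (illustrative):
+
+     unsigned int __status = _xbegin ();
+     if (__status == _XBEGIN_STARTED)
+       {
+         ...transactional region, ended by _xend ()...
+       }
+     else
+       {
+         ...fallback path; if _XABORT_EXPLICIT is set in __status,
+         _XABORT_CODE (__status) recovers the _xabort argument...
+       }
+*/
+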
+/* Specify the end of an RTM code region. If it corresponds to the
+ outermost transaction, then attempts the transaction commit. If the
+ commit fails, then control is transferred to the outermost transaction
+ fallback handler. */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xend (void)
+{
+ __builtin_ia32_xend ();
+}
+
+/* Force an RTM abort condition. The control is transferred to the
+ outermost transaction fallback handler with the abort condition IMM. */
+#ifdef __OPTIMIZE__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xabort (const unsigned int __imm)
+{
+ __builtin_ia32_xabort (__imm);
+}
+#else
+#define _xabort(N) __builtin_ia32_xabort (N)
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _RTMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <serializeintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _SERIALIZE_H_INCLUDED
+#define _SERIALIZE_H_INCLUDED
+
+#ifndef __SERIALIZE__
+#pragma GCC push_options
+#pragma GCC target("serialize")
+#define __DISABLE_SERIALIZE__
+#endif /* __SERIALIZE__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_serialize (void)
+{
+ __builtin_ia32_serialize ();
+}
+
+#ifdef __DISABLE_SERIALIZE__
+#undef __DISABLE_SERIALIZE__
+#pragma GCC pop_options
+#endif /* __DISABLE_SERIALIZE__ */
+
+#endif /* _SERIALIZE_H_INCLUDED. */
--- /dev/null
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _SGXINTRIN_H_INCLUDED
+#define _SGXINTRIN_H_INCLUDED
+
+#ifndef __SGX__
+#pragma GCC push_options
+#pragma GCC target("sgx")
+#define __DISABLE_SGX__
+#endif /* __SGX__ */
+
+#define __encls_bc(leaf, b, c, retval) \
+ __asm__ __volatile__ ("encls\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "b" (b), "c" (c) \
+ : "cc")
+
+#define __encls_bcd(leaf, b, c, d, retval) \
+ __asm__ __volatile__("encls\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "b" (b), "c" (c), "d" (d) \
+ : "cc")
+
+#define __encls_c(leaf, c, retval) \
+ __asm__ __volatile__("encls\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "c" (c) \
+ : "cc")
+
+#define __encls_edbgrd(leaf, b, c, retval) \
+ __asm__ __volatile__("encls\n\t" \
+ : "=a" (retval), "=b" (b) \
+ : "a" (leaf), "c" (c))
+
+#define __encls_generic(leaf, b, c, d, retval) \
+ __asm__ __volatile__("encls\n\t" \
+ : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\
+ : "a" (leaf), "b" (b), "c" (c), "d" (d) \
+ : "cc")
+
+#define __enclu_bc(leaf, b, c, retval) \
+ __asm__ __volatile__("enclu\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "b" (b), "c" (c) \
+ : "cc")
+
+#define __enclu_bcd(leaf, b, c, d, retval) \
+ __asm__ __volatile__("enclu\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "b" (b), "c" (c), "d" (d) \
+ : "cc")
+
+#define __enclu_eenter(leaf, b, c, retval) \
+ __asm__ __volatile__("enclu\n\t" \
+ : "=a" (retval), "=c" (c) \
+ : "a" (leaf), "b" (b), "c" (c) \
+ : "cc")
+
+#define __enclu_eexit(leaf, b, c, retval) \
+ __asm__ __volatile__("enclu\n\t" \
+ : "=a" (retval), "=c" (c) \
+ : "a" (leaf), "b" (b) \
+ : "cc")
+
+#define __enclu_generic(leaf, b, c, d, retval) \
+ __asm__ __volatile__("enclu\n\t" \
+ : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\
+ : "a" (leaf), "b" (b), "c" (c), "d" (d) \
+ : "cc")
+
+#define __enclv_bc(leaf, b, c, retval) \
+ __asm__ __volatile__("enclv\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "b" (b), "c" (c) \
+ : "cc")
+
+#define __enclv_cd(leaf, c, d, retval) \
+ __asm__ __volatile__("enclv\n\t" \
+ : "=a" (retval) \
+ : "a" (leaf), "c" (c), "d" (d) \
+ : "cc")
+
+#define __enclv_generic(leaf, b, c, d, retval) \
+ __asm__ __volatile__("enclv\n\t" \
+ : "=a" (retval), "=b" (b), "=c" (b), "=d" (d)\
+ : "a" (leaf), "b" (b), "c" (c), "d" (d) \
+ : "cc")
+
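+/* Note (editorial): in all of these wrappers the leaf number travels
+   in EAX and __D[0], __D[1], __D[2] in RBX, RCX and RDX; the EAX
+   result is returned, and some leaves also write results back
+   through the RBX/RCX/RDX bindings.  */
+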
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_encls_u32 (const unsigned int __L, size_t __D[])
+{
+ enum __encls_type
+ {
+ __SGX_ECREATE = 0x00,
+ __SGX_EADD = 0x01,
+ __SGX_EINIT = 0x02,
+ __SGX_EREMOVE = 0x03,
+ __SGX_EDBGRD = 0x04,
+ __SGX_EDBGWR = 0x05,
+ __SGX_EEXTEND = 0x06,
+ __SGX_ELDB = 0x07,
+ __SGX_ELDU = 0x08,
+ __SGX_EBLOCK = 0x09,
+ __SGX_EPA = 0x0A,
+ __SGX_EWB = 0x0B,
+ __SGX_ETRACK = 0x0C,
+ __SGX_EAUG = 0x0D,
+ __SGX_EMODPR = 0x0E,
+ __SGX_EMODT = 0x0F,
+ __SGX_ERDINFO = 0x10,
+ __SGX_ETRACKC = 0x11,
+ __SGX_ELDBC = 0x12,
+ __SGX_ELDUC = 0x13
+ };
+ enum __encls_type __T = (enum __encls_type)__L;
+ unsigned int __R = 0;
+ if (!__builtin_constant_p (__T))
+ __encls_generic (__L, __D[0], __D[1], __D[2], __R);
+ else switch (__T)
+ {
+ case __SGX_ECREATE:
+ case __SGX_EADD:
+ case __SGX_EDBGWR:
+ case __SGX_EEXTEND:
+ case __SGX_EPA:
+ case __SGX_EMODPR:
+ case __SGX_EMODT:
+ case __SGX_EAUG:
+ case __SGX_ERDINFO:
+ __encls_bc (__L, __D[0], __D[1], __R);
+ break;
+ case __SGX_EINIT:
+ case __SGX_ELDB:
+ case __SGX_ELDU:
+ case __SGX_EWB:
+ case __SGX_ELDBC:
+ case __SGX_ELDUC:
+ __encls_bcd (__L, __D[0], __D[1], __D[2], __R);
+ break;
+ case __SGX_EREMOVE:
+ case __SGX_EBLOCK:
+ case __SGX_ETRACK:
+ case __SGX_ETRACKC:
+ __encls_c (__L, __D[1], __R);
+ break;
+ case __SGX_EDBGRD:
+ __encls_edbgrd (__L, __D[0], __D[1], __R);
+ break;
+ default:
+ __encls_generic (__L, __D[0], __D[1], __D[2], __R);
+ }
+ return __R;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enclu_u32 (const unsigned int __L, size_t __D[])
+{
+ enum __enclu_type
+ {
+ __SGX_EREPORT = 0x00,
+ __SGX_EGETKEY = 0x01,
+ __SGX_EENTER = 0x02,
+ __SGX_ERESUME = 0x03,
+ __SGX_EEXIT = 0x04,
+ __SGX_EACCEPT = 0x05,
+ __SGX_EMODPE = 0x06,
+ __SGX_EACCEPTCOPY = 0x07
+ };
+ enum __enclu_type __T = (enum __enclu_type) __L;
+ unsigned int __R = 0;
+ if (!__builtin_constant_p (__T))
+ __enclu_generic (__L, __D[0], __D[1], __D[2], __R);
+ else switch (__T)
+ {
+ case __SGX_EREPORT:
+ case __SGX_EACCEPTCOPY:
+ __enclu_bcd (__L, __D[0], __D[1], __D[2], __R);
+ break;
+ case __SGX_EGETKEY:
+ case __SGX_ERESUME:
+ case __SGX_EACCEPT:
+ case __SGX_EMODPE:
+ __enclu_bc (__L, __D[0], __D[1], __R);
+ break;
+ case __SGX_EENTER:
+ __enclu_eenter (__L, __D[0], __D[1], __R);
+ break;
+ case __SGX_EEXIT:
+ __enclu_eexit (__L, __D[0], __D[1], __R);
+ break;
+ default:
+ __enclu_generic (__L, __D[0], __D[1], __D[2], __R);
+ }
+ return __R;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enclv_u32 (const unsigned int __L, size_t __D[])
+{
+ enum __enclv_type
+ {
+ __SGX_EDECVIRTCHILD = 0x00,
+ __SGX_EINCVIRTCHILD = 0x01,
+ __SGX_ESETCONTEXT = 0x02
+ };
+ unsigned int __R = 0;
+ if (!__builtin_constant_p (__L))
+ __enclv_generic (__L, __D[0], __D[1], __D[2], __R);
+ else switch (__L)
+ {
+ case __SGX_EDECVIRTCHILD:
+ case __SGX_EINCVIRTCHILD:
+ __enclv_bc (__L, __D[0], __D[1], __R);
+ break;
+ case __SGX_ESETCONTEXT:
+ __enclv_cd (__L, __D[1], __D[2], __R);
+ break;
+ default:
+ __enclv_generic (__L, __D[0], __D[1], __D[2], __R);
+ }
+ return __R;
+}
+
+#ifdef __DISABLE_SGX__
+#undef __DISABLE_SGX__
+#pragma GCC pop_options
+#endif /* __DISABLE_SGX__ */
+
+#endif /* _SGXINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _SHAINTRIN_H_INCLUDED
+#define _SHAINTRIN_H_INCLUDED
+
+#ifndef __SHA__
+#pragma GCC push_options
+#pragma GCC target("sha")
+#define __DISABLE_SHA__
+#endif /* __SHA__ */
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1msg1_epu32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_sha1msg1 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1msg2_epu32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_sha1msg2 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1nexte_epu32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I)
+{
+ return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I);
+}
+#else
+#define _mm_sha1rnds4_epu32(A, B, I) \
+ ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)(A), \
+ (__v4si)(__m128i)(B), (int)(I)))
+#endif
+
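+/* Note (illustrative): the 2-bit immediate of _mm_sha1rnds4_epu32
+   selects which of the four groups of 20 SHA-1 rounds supplies the
+   round constant and boolean function.  */
+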
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256msg1_epu32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_sha256msg1 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256msg2_epu32 (__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_sha256msg2 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_sha256rnds2 ((__v4si) __A, (__v4si) __B,
+ (__v4si) __C);
+}
+
+#ifdef __DISABLE_SHA__
+#undef __DISABLE_SHA__
+#pragma GCC pop_options
+#endif /* __DISABLE_SHA__ */
+
+#endif /* _SHAINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 10.0. */
+
+#ifndef _SMMINTRIN_H_INCLUDED
+#define _SMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
+ files. */
+#include <tmmintrin.h>
+
+#ifndef __SSE4_1__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define __DISABLE_SSE4_1__
+#endif /* __SSE4_1__ */
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_NEG_INF 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_ZERO 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NO_EXC 0x08
+
+#define _MM_FROUND_NINT \
+ (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR \
+ (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL \
+ (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC \
+ (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT \
+ (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT \
+ (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
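+/* Usage sketch (illustrative): round each lane down without raising
+   an inexact exception:
+
+     __m128d __r = _mm_round_pd (__v, _MM_FROUND_TO_NEG_INF
+                                      | _MM_FROUND_NO_EXC);
+*/
+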
+/* Test Instruction */
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+ (__V & __M) == 0. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_si128 (__m128i __M, __m128i __V)
+{
+ return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+ (__V & ~__M) == 0. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_si128 (__m128i __M, __m128i __V)
+{
+ return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+ (__V & __M) != 0 && (__V & ~__M) != 0. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_si128 (__m128i __M, __m128i __V)
+{
+ return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Macros for packed integer 128-bit comparison intrinsics. */
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+#define _mm_test_all_ones(V) \
+ _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
+
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
+
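+/* Illustrative sketch (editorial addition, not part of the upstream
+   header; the helper name is hypothetical): passing the same vector
+   as both mask and value turns _mm_test_all_zeros into the idiomatic
+   "is this vector all zeros?" test, compiling to a single PTEST.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_is_zero_si128 (__m128i __V)
+{
+ return _mm_test_all_zeros (__V, __V);
+}
+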
+/* Packed/scalar double precision floating point rounding. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_pd (__m128d __V, const int __M)
+{
+ return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_sd (__m128d __D, __m128d __V, const int __M)
+{
+ return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
+ (__v2df)__V,
+ __M);
+}
+#else
+#define _mm_round_pd(V, M) \
+ ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))
+
+#define _mm_round_sd(D, V, M) \
+ ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \
+ (__v2df)(__m128d)(V), (int)(M)))
+#endif
+
+/* Packed/scalar single precision floating point rounding. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ps (__m128 __V, const int __M)
+{
+ return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ss (__m128 __D, __m128 __V, const int __M)
+{
+ return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
+ (__v4sf)__V,
+ __M);
+}
+#else
+#define _mm_round_ps(V, M) \
+ ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))
+
+#define _mm_round_ss(D, V, M) \
+ ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \
+ (__v4sf)(__m128)(V), (int)(M)))
+#endif
+
+/* Macros for ceil/floor intrinsics. */
+#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
+
+#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
+
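+/* Illustrative sketch (editorial addition, name hypothetical):
+   _MM_FROUND_FLOOR rounds toward negative infinity, so
+   { 2.5, -2.5 } becomes { 2.0, -3.0 }, whereas
+   _MM_FROUND_TO_NEAREST_INT would give { 2.0, -2.0 } under
+   round-to-nearest-even.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_floor_pd (__m128d __V)
+{
+ return _mm_round_pd (__V, _MM_FROUND_FLOOR);
+}
+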
+/* SSE4.1 */
+
+/* Integer blend instructions - select data from 2 sources using
+ constant/variable mask. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
+ (__v8hi)__Y,
+ __M);
+}
+#else
+#define _mm_blend_epi16(X, Y, M) \
+ ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), \
+ (__v8hi)(__m128i)(Y), (int)(M)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
+{
+ return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ (__v16qi)__M);
+}
+
+/* Single precision floating point blend instructions - select data
+ from 2 sources using constant/variable mask. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
+{
+ return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
+ (__v4sf)__Y,
+ __M);
+}
+#else
+#define _mm_blend_ps(X, Y, M) \
+ ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(M)))
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
+{
+ return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
+ (__v4sf)__Y,
+ (__v4sf)__M);
+}
+
+/* Double precision floating point blend instructions - select data
+ from 2 sources using constant/variable mask. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
+{
+ return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
+ (__v2df)__Y,
+ __M);
+}
+#else
+#define _mm_blend_pd(X, Y, M) \
+ ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(M)))
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
+{
+ return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
+ (__v2df)__Y,
+ (__v2df)__M);
+}
+
+/* Dot product instructions with mask-defined summing and zeroing parts
+ of result. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
+{
+ return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
+ (__v4sf)__Y,
+ __M);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
+{
+ return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
+ (__v2df)__Y,
+ __M);
+}
+#else
+#define _mm_dp_ps(X, Y, M) \
+ ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), (int)(M)))
+
+#define _mm_dp_pd(X, Y, M) \
+ ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), (int)(M)))
+#endif
+
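+/* Illustrative sketch (editorial addition, name hypothetical): in the
+   _mm_dp_ps mask the high nibble selects which element products enter
+   the sum and the low nibble selects which result lanes receive it,
+   so 0xff computes a full 4-element dot product and broadcasts it to
+   all four lanes.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_dot4_ps (__m128 __X, __m128 __Y)
+{
+ return _mm_dp_ps (__X, __Y, 0xff);
+}
+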
+/* Packed integer 64-bit comparison, zeroing or filling with ones
+ corresponding parts of result. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) ((__v2di)__X == (__v2di)__Y);
+}
+
+/* Min/max packed integer instructions. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
+}
+
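+/* Illustrative sketch (editorial addition, name hypothetical):
+   clamping each unsigned 32-bit lane to [__LO, __HI] takes one max
+   and one min.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_clamp_epu32 (__m128i __X, __m128i __LO, __m128i __HI)
+{
+ return _mm_min_epu32 (_mm_max_epu32 (__X, __LO), __HI);
+}
+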
+/* Packed integer 32-bit multiplication with truncation of upper
+ halves of results. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) ((__v4su)__X * (__v4su)__Y);
+}
+
+/* Packed integer 32-bit multiplication of 2 pairs of operands
+ with two 64-bit results. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Insert single precision float into packed single precision array
+ element selected by index N. The bits [7-6] of N define S
+ index, the bits [5-4] define D index, and bits [3-0] define
+ zeroing mask for D. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
+{
+ return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
+ (__v4sf)__S,
+ __N);
+}
+#else
+#define _mm_insert_ps(D, S, N) \
+ ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), \
+ (__v4sf)(__m128)(S), (int)(N)))
+#endif
+
+/* Helper macro to create the N value for _mm_insert_ps. */
+#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
+
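+/* Illustrative sketch (editorial addition, name hypothetical):
+   _MM_MK_INSERTPS_NDX (1, 2, 0) builds the immediate 0x60, i.e. copy
+   element 1 of S into element 2 of D and zero nothing.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_copy_lane_ps (__m128 __D, __m128 __S)
+{
+ return _mm_insert_ps (__D, __S, _MM_MK_INSERTPS_NDX (1, 2, 0));
+}
+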
+/* Extract binary representation of single precision float from packed
+ single precision array element of X selected by index N. */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
+ union { int __i; float __f; } __tmp;
+ __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
+ return __tmp.__i;
+}
+#else
+#define _mm_extract_ps(X, N) \
+ (__extension__ \
+ ({ \
+ union { int __i; float __f; } __tmp; \
+ __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), \
+ (int)(N)); \
+ __tmp.__i; \
+ }))
+#endif
+
+/* Extract binary representation of single precision float into
+ D from packed single precision array element of S selected
+ by index N. */
+#define _MM_EXTRACT_FLOAT(D, S, N) \
+ { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
+
+/* Extract specified single precision float element into the lower
+ part of __m128. */
+#define _MM_PICK_OUT_PS(X, N) \
+ _mm_insert_ps (_mm_setzero_ps (), (X), \
+ _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
+
+/* Insert integer, S, into packed integer array element of D
+ selected by index N. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi8 (__m128i __D, int __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
+ __S, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi32 (__m128i __D, int __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
+ __S, __N);
+}
+
+#ifdef __x86_64__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
+ __S, __N);
+}
+#endif
+#else
+#define _mm_insert_epi8(D, S, N) \
+ ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), \
+ (int)(S), (int)(N)))
+
+#define _mm_insert_epi32(D, S, N) \
+ ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), \
+ (int)(S), (int)(N)))
+
+#ifdef __x86_64__
+#define _mm_insert_epi64(D, S, N) \
+ ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), \
+ (long long)(S), (int)(N)))
+#endif
+#endif
+
+/* Extract integer from packed integer array element of X selected by
+ index N. */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+ return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
+}
+
+#ifdef __x86_64__
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
+ return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
+}
+#endif
+#else
+#define _mm_extract_epi8(X, N) \
+ ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
+#define _mm_extract_epi32(X, N) \
+ ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))
+
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) \
+ ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
+#endif
+#endif
+
+/* Return horizontal packed word minimum and its index in bits [15:0]
+ and bits [18:16] respectively. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_minpos_epu16 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
+}
+
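+/* Illustrative sketch (editorial addition, name hypothetical): the
+   index of the minimum word can be recovered by moving lane 0 to a
+   general register and shifting out the 16-bit value below it.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_hmin_index_epu16 (__m128i __X)
+{
+ return (_mm_cvtsi128_si32 (_mm_minpos_epu16 (__X)) >> 16) & 0x7;
+}
+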
+/* Packed integer sign-extension. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi32 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi32 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi64 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi64 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi64 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi16 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
+}
+
+/* Packed integer zero-extension. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi32 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi32 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi64 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_epi64 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi64 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi16 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
+}
+
+/* Pack 8 double words from 2 operands into 8 words of result with
+ unsigned saturation. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+ byte integers in the first 2 operands. Starting offsets within
+ operands are determined by the 3rd mask operand. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
+ (__v16qi)__Y, __M);
+}
+#else
+#define _mm_mpsadbw_epu8(X, Y, M) \
+ ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+#endif
+
+/* Load double quadword using non-temporal aligned hint. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_load_si128 (__m128i *__X)
+{
+ return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
+}
+
+#ifndef __SSE4_2__
+#pragma GCC push_options
+#pragma GCC target("sse4.2")
+#define __DISABLE_SSE4_2__
+#endif /* __SSE4_2__ */
+
+/* These macros specify the source data format. */
+#define _SIDD_UBYTE_OPS 0x00
+#define _SIDD_UWORD_OPS 0x01
+#define _SIDD_SBYTE_OPS 0x02
+#define _SIDD_SWORD_OPS 0x03
+
+/* These macros specify the comparison operation. */
+#define _SIDD_CMP_EQUAL_ANY 0x00
+#define _SIDD_CMP_RANGES 0x04
+#define _SIDD_CMP_EQUAL_EACH 0x08
+#define _SIDD_CMP_EQUAL_ORDERED 0x0c
+
+/* These macros specify the polarity. */
+#define _SIDD_POSITIVE_POLARITY 0x00
+#define _SIDD_NEGATIVE_POLARITY 0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
+
+/* These macros specify the output selection in _mm_cmpXstri (). */
+#define _SIDD_LEAST_SIGNIFICANT 0x00
+#define _SIDD_MOST_SIGNIFICANT 0x40
+
+/* These macros specify the output selection in _mm_cmpXstrm (). */
+#define _SIDD_BIT_MASK 0x00
+#define _SIDD_UNIT_MASK 0x40
+
+/* Intrinsics for text/string processing. */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
+{
+ return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
+{
+ return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ __M);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+ return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
+ (__v16qi)__Y, __LY,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+ return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
+ (__v16qi)__Y, __LY,
+ __M);
+}
+#else
+#define _mm_cmpistrm(X, Y, M) \
+ ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistri(X, Y, M) \
+ ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+
+#define _mm_cmpestrm(X, LX, Y, LY, M) \
+ ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), \
+ (int)(LX), (__v16qi)(__m128i)(Y), \
+ (int)(LY), (int)(M)))
+#define _mm_cmpestri(X, LX, Y, LY, M) \
+ ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \
+ (__v16qi)(__m128i)(Y), (int)(LY), \
+ (int)(M)))
+#endif
+
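+/* Illustrative sketch (editorial addition, name hypothetical): a
+   well-known SSE4.2 idiom -- comparing a chunk against an all-zero
+   vector with EQUAL_EACH makes _mm_cmpistri return the index of the
+   first zero byte (16 if none), the core step of a vectorized
+   strlen.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_first_zero_byte (__m128i __Chunk)
+{
+ return _mm_cmpistri (_mm_setzero_si128 (), __Chunk,
+ _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH);
+}
+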
+/* Intrinsics for text/string processing and reading values of
+ EFlags. */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
+{
+ return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
+{
+ return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
+{
+ return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
+{
+ return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
+{
+ return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
+ (__v16qi)__Y,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+ return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
+ (__v16qi)__Y, __LY,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+ return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
+ (__v16qi)__Y, __LY,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+ return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
+ (__v16qi)__Y, __LY,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+ return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
+ (__v16qi)__Y, __LY,
+ __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+ return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
+ (__v16qi)__Y, __LY,
+ __M);
+}
+#else
+#define _mm_cmpistra(X, Y, M) \
+ ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrc(X, Y, M) \
+ ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistro(X, Y, M) \
+ ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrs(X, Y, M) \
+ ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrz(X, Y, M) \
+ ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), \
+ (__v16qi)(__m128i)(Y), (int)(M)))
+
+#define _mm_cmpestra(X, LX, Y, LY, M) \
+ ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
+ (__v16qi)(__m128i)(Y), (int)(LY), \
+ (int)(M)))
+#define _mm_cmpestrc(X, LX, Y, LY, M) \
+ ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
+ (__v16qi)(__m128i)(Y), (int)(LY), \
+ (int)(M)))
+#define _mm_cmpestro(X, LX, Y, LY, M) \
+ ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
+ (__v16qi)(__m128i)(Y), (int)(LY), \
+ (int)(M)))
+#define _mm_cmpestrs(X, LX, Y, LY, M) \
+ ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
+ (__v16qi)(__m128i)(Y), (int)(LY), \
+ (int)(M)))
+#define _mm_cmpestrz(X, LX, Y, LY, M) \
+ ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
+ (__v16qi)(__m128i)(Y), (int)(LY), \
+ (int)(M)))
+#endif
+
+/* Packed integer 64-bit comparison, zeroing or filling with ones
+ corresponding parts of result. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) ((__v2di)__X > (__v2di)__Y);
+}
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+#ifdef __DISABLE_SSE4_1__
+#undef __DISABLE_SSE4_1__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_1__ */
+
+#include <popcntintrin.h>
+
+#ifndef __CRC32__
+#pragma GCC push_options
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
+
+/* Accumulate CRC32 (polynomial 0x11EDC6F41) value. */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u8 (unsigned int __C, unsigned char __V)
+{
+ return __builtin_ia32_crc32qi (__C, __V);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u16 (unsigned int __C, unsigned short __V)
+{
+ return __builtin_ia32_crc32hi (__C, __V);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u32 (unsigned int __C, unsigned int __V)
+{
+ return __builtin_ia32_crc32si (__C, __V);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
+{
+ return __builtin_ia32_crc32di (__C, __V);
+}
+#endif
+
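+/* Illustrative sketch (editorial addition, name hypothetical):
+   accumulating CRC32-C over a byte buffer; by convention the running
+   value starts at ~0 and is inverted once more at the end.  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_crc32c_bytes (const unsigned char *__B, unsigned long long __N)
+{
+ unsigned int __C = 0xffffffff;
+ while (__N--)
+ __C = _mm_crc32_u8 (__C, *__B++);
+ return ~__C;
+}
+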
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
+#pragma GCC pop_options
+#endif /* __DISABLE_CRC32__ */
+
+#endif /* _SMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2010-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <tbmintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _TBMINTRIN_H_INCLUDED
+#define _TBMINTRIN_H_INCLUDED
+
+#ifndef __TBM__
+#pragma GCC push_options
+#pragma GCC target("tbm")
+#define __DISABLE_TBM__
+#endif /* __TBM__ */
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u32 (unsigned int __X, const unsigned int __I)
+{
+ return __builtin_ia32_bextri_u32 (__X, __I);
+}
+#else
+#define __bextri_u32(X, I) \
+ ((unsigned int)__builtin_ia32_bextri_u32 ((unsigned int)(X), \
+ (unsigned int)(I)))
+#endif /*__OPTIMIZE__ */
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u32 (unsigned int __X)
+{
+ return __X & (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u32 (unsigned int __X)
+{
+ return __X | ~(__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u32 (unsigned int __X)
+{
+ return ~__X & (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u32 (unsigned int __X)
+{
+ return __X ^ (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u32 (unsigned int __X)
+{
+ return __X | (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u32 (unsigned int __X)
+{
+ return __X | (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u32 (unsigned int __X)
+{
+ return ~__X | (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u32 (unsigned int __X)
+{
+ return ~__X | (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u32 (unsigned int __X)
+{
+ return ~__X & (__X - 1);
+}
+
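+/* Illustrative sketch (editorial addition, name hypothetical):
+   __tzmsk_u32 masks exactly the trailing zero bits, so its popcount
+   equals the trailing-zero count (32 for __X == 0).  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_tzcnt_via_tzmsk (unsigned int __X)
+{
+ return __builtin_popcount (__tzmsk_u32 (__X));
+}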
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u64 (unsigned long long __X, const unsigned int __I)
+{
+ return __builtin_ia32_bextri_u64 (__X, __I);
+}
+#else
+#define __bextri_u64(X, I) \
+ ((unsigned long long)__builtin_ia32_bextri_u64 ((unsigned long long)(X), \
+ (unsigned long long)(I)))
+#endif /*__OPTIMIZE__ */
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u64 (unsigned long long __X)
+{
+ return __X & (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u64 (unsigned long long __X)
+{
+ return __X | ~(__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u64 (unsigned long long __X)
+{
+ return ~__X & (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u64 (unsigned long long __X)
+{
+ return __X ^ (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u64 (unsigned long long __X)
+{
+ return __X | (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u64 (unsigned long long __X)
+{
+ return __X | (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u64 (unsigned long long __X)
+{
+ return ~__X | (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u64 (unsigned long long __X)
+{
+ return ~__X | (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u64 (unsigned long long __X)
+{
+ return ~__X & (__X - 1);
+}
+
+#endif /* __x86_64__ */
+
+#ifdef __DISABLE_TBM__
+#undef __DISABLE_TBM__
+#pragma GCC pop_options
+#endif /* __DISABLE_TBM__ */
+
+#endif /* _TBMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2006-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 9.1. */
+
+#ifndef _TMMINTRIN_H_INCLUDED
+#define _TMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE3, SSE2 and SSE header files.  */
+#include <pmmintrin.h>
+
+#ifndef __SSSE3__
+#pragma GCC push_options
+#pragma GCC target("ssse3")
+#define __DISABLE_SSSE3__
+#endif /* __SSSE3__ */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_epi16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi16 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi32 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_pi16 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_epi16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi16 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi32 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_pi16 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_epi16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_pi16 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_epi16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_pi16 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi8 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi8 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi16 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi32 (__m64 __X, __m64 __Y)
+{
+ return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi8 (__m128i __X, __m128i __Y, const int __N)
+{
+ return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X,
+ (__v2di)__Y, __N * 8);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_pi8 (__m64 __X, __m64 __Y, const int __N)
+{
+ return (__m64) __builtin_ia32_palignr ((__v1di)__X,
+ (__v1di)__Y, __N * 8);
+}
+#else
+#define _mm_alignr_epi8(X, Y, N) \
+ ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), \
+ (int)(N) * 8))
+#define _mm_alignr_pi8(X, Y, N) \
+ ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X), \
+ (__v1di)(__m64)(Y), \
+ (int)(N) * 8))
+#endif
+
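+/* Illustrative sketch (editorial addition, name hypothetical):
+   _mm_alignr_epi8 (__H, __L, 4) extracts bytes 4..19 of the 32-byte
+   concatenation __H:__L, the usual way to form a shifted window over
+   two consecutive 16-byte loads.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_shift_window (__m128i __H, __m128i __L)
+{
+ return _mm_alignr_epi8 (__H, __L, 4);
+}
+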
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi8 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi16 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi32 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi8 (__m64 __X)
+{
+ return (__m64) __builtin_ia32_pabsb ((__v8qi)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi16 (__m64 __X)
+{
+ return (__m64) __builtin_ia32_pabsw ((__v4hi)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi32 (__m64 __X)
+{
+ return (__m64) __builtin_ia32_pabsd ((__v2si)__X);
+}
+
+#ifdef __DISABLE_SSSE3__
+#undef __DISABLE_SSSE3__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSSE3__ */
+
+#endif /* _TMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <tsxldtrkintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _TSXLDTRKINTRIN_H_INCLUDED
+#define _TSXLDTRKINTRIN_H_INCLUDED
+
+#if !defined(__TSXLDTRK__)
+#pragma GCC push_options
+#pragma GCC target("tsxldtrk")
+#define __DISABLE_TSXLDTRK__
+#endif /* __TSXLDTRK__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsusldtrk (void)
+{
+ __builtin_ia32_xsusldtrk ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xresldtrk (void)
+{
+ __builtin_ia32_xresldtrk ();
+}
+
+#ifdef __DISABLE_TSXLDTRK__
+#undef __DISABLE_TSXLDTRK__
+#pragma GCC pop_options
+#endif /* __DISABLE_TSXLDTRK__ */
+
+#endif /* _TSXLDTRKINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _UINTRNTRIN_H_INCLUDED
+#define _UINTRNTRIN_H_INCLUDED
+
+#ifdef __x86_64__
+
+#ifndef __UINTR__
+#pragma GCC push_options
+#pragma GCC target ("uintr")
+#define __DISABLE_UINTR__
+#endif /* __UINTR__ */
+
+struct __uintr_frame
+{
+ /* RIP of the interrupted user process. */
+ unsigned long long rip;
+ /* RFLAGS of the interrupted user process. */
+ unsigned long long rflags;
+ /* RSP of the interrupted user process. */
+ unsigned long long rsp;
+};
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_clui (void)
+{
+ __builtin_ia32_clui ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_stui (void)
+{
+ __builtin_ia32_stui ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_senduipi (unsigned long long __R)
+{
+ __builtin_ia32_senduipi (__R);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_testui (void)
+{
+ return __builtin_ia32_testui ();
+}
+
+#ifdef __DISABLE_UINTR__
+#undef __DISABLE_UINTR__
+#pragma GCC pop_options
+#endif /* __DISABLE_UINTR__ */
+
+#endif
+
+#endif /* _UINTRNTRIN_H_INCLUDED. */
--- /dev/null
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef __VAESINTRIN_H_INCLUDED
+#define __VAESINTRIN_H_INCLUDED
+
+#if !defined(__VAES__) || !defined(__AVX__)
+#pragma GCC push_options
+#pragma GCC target("vaes,avx")
+#define __DISABLE_VAES__
+#endif /* __VAES__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesdec_epi128 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vaesdec_v32qi ((__v32qi) __A, (__v32qi) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesdeclast_epi128 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vaesdeclast_v32qi ((__v32qi) __A,
+ (__v32qi) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesenc_epi128 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vaesenc_v32qi ((__v32qi) __A, (__v32qi) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesenclast_epi128 (__m256i __A, __m256i __B)
+{
+ return (__m256i)__builtin_ia32_vaesenclast_v32qi ((__v32qi) __A,
+ (__v32qi) __B);
+}
+
+#ifdef __DISABLE_VAES__
+#undef __DISABLE_VAES__
+#pragma GCC pop_options
+#endif /* __DISABLE_VAES__ */
+
+
+#if !defined(__VAES__) || !defined(__AVX512F__)
+#pragma GCC push_options
+#pragma GCC target("vaes,avx512f")
+#define __DISABLE_VAESF__
+#endif /* __VAES__ */
+
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesdec_epi128 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vaesdec_v64qi ((__v64qi) __A, (__v64qi) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesdeclast_epi128 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vaesdeclast_v64qi ((__v64qi) __A,
+ (__v64qi) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesenc_epi128 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vaesenc_v64qi ((__v64qi) __A, (__v64qi) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesenclast_epi128 (__m512i __A, __m512i __B)
+{
+ return (__m512i)__builtin_ia32_vaesenclast_v64qi ((__v64qi) __A,
+ (__v64qi) __B);
+}
+
+#ifdef __DISABLE_VAESF__
+#undef __DISABLE_VAESF__
+#pragma GCC pop_options
+#endif /* __DISABLE_VAESF__ */
+
+#endif /* __VAESINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _VPCLMULQDQINTRIN_H_INCLUDED
+#define _VPCLMULQDQINTRIN_H_INCLUDED
+
+#if !defined(__VPCLMULQDQ__) || !defined(__AVX512F__)
+#pragma GCC push_options
+#pragma GCC target("vpclmulqdq,avx512f")
+#define __DISABLE_VPCLMULQDQF__
+#endif /* __VPCLMULQDQ__ */
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_clmulepi64_epi128 (__m512i __A, __m512i __B, const int __C)
+{
+ return (__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)__A,
+ (__v8di) __B, __C);
+}
+#else
+#define _mm512_clmulepi64_epi128(A, B, C) \
+ ((__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)(__m512i)(A), \
+ (__v8di)(__m512i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_VPCLMULQDQF__
+#undef __DISABLE_VPCLMULQDQF__
+#pragma GCC pop_options
+#endif /* __DISABLE_VPCLMULQDQF__ */
+
+#if !defined(__VPCLMULQDQ__) || !defined(__AVX__)
+#pragma GCC push_options
+#pragma GCC target("vpclmulqdq,avx")
+#define __DISABLE_VPCLMULQDQ__
+#endif /* __VPCLMULQDQ__ */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_clmulepi64_epi128 (__m256i __A, __m256i __B, const int __C)
+{
+ return (__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)__A,
+ (__v4di) __B, __C);
+}
+#else
+#define _mm256_clmulepi64_epi128(A, B, C) \
+ ((__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)(__m256i)(A), \
+ (__v4di)(__m256i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_VPCLMULQDQ__
+#undef __DISABLE_VPCLMULQDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_VPCLMULQDQ__ */
+
+#endif /* _VPCLMULQDQINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <waitpkgintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _WAITPKG_H_INCLUDED
+#define _WAITPKG_H_INCLUDED
+
+#ifndef __WAITPKG__
+#pragma GCC push_options
+#pragma GCC target("waitpkg")
+#define __DISABLE_WAITPKG__
+#endif /* __WAITPKG__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_umonitor (void *__A)
+{
+ __builtin_ia32_umonitor (__A);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_umwait (unsigned int __A, unsigned long long __B)
+{
+ return __builtin_ia32_umwait (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tpause (unsigned int __A, unsigned long long __B)
+{
+ return __builtin_ia32_tpause (__A, __B);
+}
+
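+/* Illustrative sketch (editorial addition, name hypothetical): arm a
+   monitor on an address, then wait until it is written or the TSC
+   deadline passes.  Control value 0 requests the deeper C0.2 state;
+   the returned carry flag is set when the wait was cut short by the
+   OS-imposed time limit.  */
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_wait_on (void *__A, unsigned long long __Deadline)
+{
+ _umonitor (__A);
+ return _umwait (0, __Deadline);
+}
+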
+#ifdef __DISABLE_WAITPKG__
+#undef __DISABLE_WAITPKG__
+#pragma GCC pop_options
+#endif /* __DISABLE_WAITPKG__ */
+
+#endif /* _WAITPKG_H_INCLUDED. */
--- /dev/null
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <wbnoinvdintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _WBNOINVDINTRIN_H_INCLUDED
+#define _WBNOINVDINTRIN_H_INCLUDED
+
+#ifndef __WBNOINVD__
+#pragma GCC push_options
+#pragma GCC target("wbnoinvd")
+#define __DISABLE_WBNOINVD__
+#endif /* __WBNOINVD__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wbnoinvd (void)
+{
+ __builtin_ia32_wbnoinvd ();
+}
+
+#ifdef __DISABLE_WBNOINVD__
+#undef __DISABLE_WBNOINVD__
+#pragma GCC pop_options
+#endif /* __DISABLE_WBNOINVD__ */
+
+#endif /* _WBNOINVDINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 10.1. */
+
+#ifndef _WMMINTRIN_H_INCLUDED
+#define _WMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE2 header file. */
+#include <emmintrin.h>
+
+/* AES */
+
+#if !defined(__AES__) || !defined(__SSE2__)
+#pragma GCC push_options
+#pragma GCC target("aes,sse2")
+#define __DISABLE_AES__
+#endif /* __AES__ */
+
+/* Performs 1 round of AES decryption of the first m128i using
+ the second m128i as a round key. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec_si128 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the last round of AES decryption of the first m128i
+ using the second m128i as a round key. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdeclast_si128 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X,
+ (__v2di)__Y);
+}
+
+/* Performs 1 round of AES encryption of the first m128i using
+ the second m128i as a round key. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc_si128 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the last round of AES encryption of the first m128i
+ using the second m128i as a round key. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenclast_si128 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the InverseMixColumn operation on the source m128i
+ and stores the result into m128i destination. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesimc_si128 (__m128i __X)
+{
+ return (__m128i) __builtin_ia32_aesimc128 ((__v2di)__X);
+}
+
+/* Generates a m128i round key for the input m128i AES cipher key and
+ byte round constant. The second parameter must be a compile time
+ constant. */
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aeskeygenassist_si128 (__m128i __X, const int __C)
+{
+ return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C);
+}
+#else
+#define _mm_aeskeygenassist_si128(X, C) \
+ ((__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)(__m128i)(X), \
+ (int)(C)))
+#endif
+
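+/* Illustrative sketch (editorial addition, name hypothetical): AES
+   rounds chain directly -- each middle round is one _mm_aesenc_si128
+   with that round's expanded key, and the final round uses
+   _mm_aesenclast_si128.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__example_aes_last_two_rounds (__m128i __S, __m128i __K9, __m128i __K10)
+{
+ return _mm_aesenclast_si128 (_mm_aesenc_si128 (__S, __K9), __K10);
+}
+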
+#ifdef __DISABLE_AES__
+#undef __DISABLE_AES__
+#pragma GCC pop_options
+#endif /* __DISABLE_AES__ */
+
+/* PCLMUL */
+
+#if !defined(__PCLMUL__) || !defined(__SSE2__)
+#pragma GCC push_options
+#pragma GCC target("pclmul,sse2")
+#define __DISABLE_PCLMUL__
+#endif /* __PCLMUL__ */
+
+/* Performs carry-less integer multiplication of 64-bit halves of
+ 128-bit input operands. The third parameter indicates which 64-bit
+ halves of the input parameters v1 and v2 should be used. It must be
+ a compile time constant. */
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clmulepi64_si128 (__m128i __X, __m128i __Y, const int __I)
+{
+ return (__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)__X,
+ (__v2di)__Y, __I);
+}
+#else
+#define _mm_clmulepi64_si128(X, Y, I) \
+ ((__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)(__m128i)(X), \
+ (__v2di)(__m128i)(Y), (int)(I)))
+#endif
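+
+/* Illustrative usage sketch (exposition only, not part of the original
+   header): the immediate selects one 64-bit half of each operand; 0x00
+   multiplies the two low halves, 0x11 the two high halves:
+
+     __m128i __prod = _mm_clmulepi64_si128 (__a, __b, 0x00);
+
+   This is the core primitive of GHASH and of carry-less CRC folding. */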
+
+#ifdef __DISABLE_PCLMUL__
+#undef __DISABLE_PCLMUL__
+#pragma GCC pop_options
+#endif /* __DISABLE_PCLMUL__ */
+
+#endif /* _WMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+#define _X86GPRINTRIN_H_INCLUDED
+
+#if !defined _SOFT_FLOAT || defined __MMX__ || defined __SSE__
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+#define __DISABLE_GENERAL_REGS_ONLY__
+#endif
+
+#include <ia32intrin.h>
+
+#ifndef __iamcu__
+
+#include <stddef.h>
+
+#include <adxintrin.h>
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <cetintrin.h>
+
+#include <cldemoteintrin.h>
+
+#include <clflushoptintrin.h>
+
+#include <clwbintrin.h>
+
+#include <clzerointrin.h>
+
+#include <cmpccxaddintrin.h>
+
+#include <enqcmdintrin.h>
+
+#include <fxsrintrin.h>
+
+#include <lzcntintrin.h>
+
+#include <lwpintrin.h>
+
+#include <movdirintrin.h>
+
+#include <mwaitintrin.h>
+
+#include <mwaitxintrin.h>
+
+#include <pconfigintrin.h>
+
+#include <popcntintrin.h>
+
+#include <pkuintrin.h>
+
+#include <prfchiintrin.h>
+
+#include <raointintrin.h>
+
+#include <rdseedintrin.h>
+
+#include <rtmintrin.h>
+
+#include <serializeintrin.h>
+
+#include <sgxintrin.h>
+
+#include <tbmintrin.h>
+
+#include <tsxldtrkintrin.h>
+
+#include <uintrintrin.h>
+
+#include <waitpkgintrin.h>
+
+#include <wbnoinvdintrin.h>
+
+#include <xsaveintrin.h>
+
+#include <xsavecintrin.h>
+
+#include <xsaveoptintrin.h>
+
+#include <xsavesintrin.h>
+
+#include <xtestintrin.h>
+
+#include <hresetintrin.h>
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wbinvd (void)
+{
+ __builtin_ia32_wbinvd ();
+}
+
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif /* __RDRND__ */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand16_step (unsigned short *__P)
+{
+ return __builtin_ia32_rdrand16_step (__P);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand32_step (unsigned int *__P)
+{
+ return __builtin_ia32_rdrand32_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDRND__ */
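+
+/* Illustrative usage sketch (exposition only): the *_step intrinsics
+   return 0 when the hardware had no random value ready, so callers
+   normally retry a bounded number of times:
+
+     unsigned int __val;
+     int __tries;
+     for (__tries = 0; __tries < 10; __tries++)
+       if (_rdrand32_step (&__val))
+         break;
+*/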
+
+#ifndef __RDPID__
+#pragma GCC push_options
+#pragma GCC target("rdpid")
+#define __DISABLE_RDPID__
+#endif /* __RDPID__ */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdpid_u32 (void)
+{
+ return __builtin_ia32_rdpid ();
+}
+#ifdef __DISABLE_RDPID__
+#undef __DISABLE_RDPID__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDPID__ */
+
+#ifdef __x86_64__
+
+#ifndef __FSGSBASE__
+#pragma GCC push_options
+#pragma GCC target("fsgsbase")
+#define __DISABLE_FSGSBASE__
+#endif /* __FSGSBASE__ */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u32 (void)
+{
+ return __builtin_ia32_rdfsbase32 ();
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u64 (void)
+{
+ return __builtin_ia32_rdfsbase64 ();
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u32 (void)
+{
+ return __builtin_ia32_rdgsbase32 ();
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u64 (void)
+{
+ return __builtin_ia32_rdgsbase64 ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u32 (unsigned int __B)
+{
+ __builtin_ia32_wrfsbase32 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u64 (unsigned long long __B)
+{
+ __builtin_ia32_wrfsbase64 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u32 (unsigned int __B)
+{
+ __builtin_ia32_wrgsbase32 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u64 (unsigned long long __B)
+{
+ __builtin_ia32_wrgsbase64 (__B);
+}
+#ifdef __DISABLE_FSGSBASE__
+#undef __DISABLE_FSGSBASE__
+#pragma GCC pop_options
+#endif /* __DISABLE_FSGSBASE__ */
+
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif /* __RDRND__ */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand64_step (unsigned long long *__P)
+{
+ return __builtin_ia32_rdrand64_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDRND__ */
+
+#endif /* __x86_64__ */
+
+#ifndef __PTWRITE__
+#pragma GCC push_options
+#pragma GCC target("ptwrite")
+#define __DISABLE_PTWRITE__
+#endif
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_ptwrite64 (unsigned long long __B)
+{
+ __builtin_ia32_ptwrite64 (__B);
+}
+#endif /* __x86_64__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_ptwrite32 (unsigned __B)
+{
+ __builtin_ia32_ptwrite32 (__B);
+}
+#ifdef __DISABLE_PTWRITE__
+#undef __DISABLE_PTWRITE__
+#pragma GCC pop_options
+#endif /* __DISABLE_PTWRITE__ */
+
+#endif /* __iamcu__ */
+
+#ifdef __DISABLE_GENERAL_REGS_ONLY__
+#undef __DISABLE_GENERAL_REGS_ONLY__
+#pragma GCC pop_options
+#endif /* __DISABLE_GENERAL_REGS_ONLY__ */
+
+#endif /* _X86GPRINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86INTRIN_H_INCLUDED
+#define _X86INTRIN_H_INCLUDED
+
+#include <x86gprintrin.h>
+
+#ifndef __iamcu__
+
+/* For including AVX instructions */
+#include <immintrin.h>
+
+#include <mm3dnow.h>
+
+#include <fma4intrin.h>
+
+#include <xopintrin.h>
+
+#endif /* __iamcu__ */
+
+#endif /* _X86INTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2002-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+ User Guide and Reference, version 9.0. */
+
+#ifndef _XMMINTRIN_H_INCLUDED
+#define _XMMINTRIN_H_INCLUDED
+
+/* We need type definitions from the MMX header file. */
+#include <mmintrin.h>
+
+/* Get _mm_malloc () and _mm_free (). */
+#include <mm_malloc.h>
+
+/* Constants for use with _mm_prefetch. */
+enum _mm_hint
+{
+ _MM_HINT_IT0 = 19,
+ _MM_HINT_IT1 = 18,
+ /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
+ _MM_HINT_ET0 = 7,
+ _MM_HINT_ET1 = 6,
+ _MM_HINT_T0 = 3,
+ _MM_HINT_T1 = 2,
+ _MM_HINT_T2 = 1,
+ _MM_HINT_NTA = 0
+};
+
+/* Loads one cache line from address P to a location "closer" to the
+ processor. The selector I specifies the type of prefetch operation. */
+#ifdef __OPTIMIZE__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch (const void *__P, enum _mm_hint __I)
+{
+ __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
+ __I & 0x3, (__I & 0x10) >> 4);
+}
+#else
+#define _mm_prefetch(P, I) \
+ __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
+#endif
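+
+/* Illustrative usage sketch (exposition only): prefetching a later array
+   element into all cache levels while the current one is processed;
+   __data and __i are hypothetical names from a surrounding loop:
+
+     _mm_prefetch (&__data[__i + 16], _MM_HINT_T0);
+*/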
+
+#ifndef __SSE__
+#pragma GCC push_options
+#pragma GCC target("sse")
+#define __DISABLE_SSE__
+#endif /* __SSE__ */
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+ vector types, and their scalar components. */
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same type. */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics. */
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+/* Create a selector for use with the SHUFPS instruction. */
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
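+
+/* Worked example (exposition only): _MM_SHUFFLE (3, 2, 1, 0) yields
+   (3 << 6) | (2 << 4) | (1 << 2) | 0 = 0xE4, the identity selector,
+   while _MM_SHUFFLE (0, 1, 2, 3) = 0x1B reverses the element order. */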
+
+/* Bits in the MXCSR. */
+#define _MM_EXCEPT_MASK 0x003f
+#define _MM_EXCEPT_INVALID 0x0001
+#define _MM_EXCEPT_DENORM 0x0002
+#define _MM_EXCEPT_DIV_ZERO 0x0004
+#define _MM_EXCEPT_OVERFLOW 0x0008
+#define _MM_EXCEPT_UNDERFLOW 0x0010
+#define _MM_EXCEPT_INEXACT 0x0020
+
+#define _MM_MASK_MASK 0x1f80
+#define _MM_MASK_INVALID 0x0080
+#define _MM_MASK_DENORM 0x0100
+#define _MM_MASK_DIV_ZERO 0x0200
+#define _MM_MASK_OVERFLOW 0x0400
+#define _MM_MASK_UNDERFLOW 0x0800
+#define _MM_MASK_INEXACT 0x1000
+
+#define _MM_ROUND_MASK 0x6000
+#define _MM_ROUND_NEAREST 0x0000
+#define _MM_ROUND_DOWN 0x2000
+#define _MM_ROUND_UP 0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+
+#define _MM_FLUSH_ZERO_MASK 0x8000
+#define _MM_FLUSH_ZERO_ON 0x8000
+#define _MM_FLUSH_ZERO_OFF 0x0000
+
+/* Create an undefined vector. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ps (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+ __m128 __Y = __Y;
+#pragma GCC diagnostic pop
+ return __Y;
+}
+
+/* Create a vector of zeros. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ps (void)
+{
+ return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* Perform the respective operation on the lower SPFP (single-precision
+ floating-point) values of A and B; the upper three SPFP values are
+ passed through from A. */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ss (__m128 __A)
+{
+ return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ss (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ss (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform the respective operation on the four SPFP values in A and B. */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) ((__v4sf)__A + (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) ((__v4sf)__A - (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) ((__v4sf)__A * (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) ((__v4sf)__A / (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform logical bit-wise operations on 128-bit values. */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_ps (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_andps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_ps (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_andnps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_ps (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_orps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_ps (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_xorps (__A, __B);
+}
+
+/* Perform a comparison on the lower SPFP values of A and B. If the
+ comparison is true, place a mask of all ones in the result, otherwise a
+ mask of zeros. The upper three SPFP values are passed through from A. */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_cmpltss ((__v4sf) __B,
+ (__v4sf)
+ __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_cmpless ((__v4sf) __B,
+ (__v4sf)
+ __A));
+}
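+
+/* Note (exposition only): there are no cmpgtss/cmpgess builtins, so the
+ two functions above compute the swapped comparison (A > B is exactly
+ B < A) and use movss to merge the low result back into A, preserving
+ the documented pass-through of A's upper three elements. */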
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_cmpnltss ((__v4sf) __B,
+ (__v4sf)
+ __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+ (__v4sf)
+ __builtin_ia32_cmpnless ((__v4sf) __B,
+ (__v4sf)
+ __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform a comparison on the four SPFP values of A and B. For each
+ element, if the comparison is true, place a mask of all ones in the
+ result, otherwise a mask of zeros. */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Compare the lower SPFP values of A and B and return 1 if true
+ and 0 if false. */
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_ss (__m128 __A, __m128 __B)
+{
+ return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the current
+ rounding mode. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si32 (__m128 __A)
+{
+ return __builtin_ia32_cvtss2si ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ss2si (__m128 __A)
+{
+ return _mm_cvtss_si32 (__A);
+}
+
+#ifdef __x86_64__
+/* Convert the lower SPFP value to a 64-bit integer according to the
+ current rounding mode. */
+
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64 (__m128 __A)
+{
+ return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64x (__m128 __A)
+{
+ return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+#endif
+
+/* Convert the two lower SPFP values to 32-bit integers according to the
+ current rounding mode. Return the integers in packed form. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi32 (__m128 __A)
+{
+ return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ps2pi (__m128 __A)
+{
+ return _mm_cvtps_pi32 (__A);
+}
+
+/* Truncate the lower SPFP value to a 32-bit integer. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si32 (__m128 __A)
+{
+ return __builtin_ia32_cvttss2si ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ss2si (__m128 __A)
+{
+ return _mm_cvttss_si32 (__A);
+}
+
+#ifdef __x86_64__
+/* Truncate the lower SPFP value to a 64-bit integer. */
+
+/* Intel intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64 (__m128 __A)
+{
+ return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
+}
+
+/* Microsoft intrinsic. */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64x (__m128 __A)
+{
+ return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
+}
+#endif
+
+/* Truncate the two lower SPFP values to 32-bit integers. Return the
+ integers in packed form. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_pi32 (__m128 __A)
+{
+ return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ps2pi (__m128 __A)
+{
+ return _mm_cvttps_pi32 (__A);
+}
+
+/* Convert B to a SPFP value and insert it as element zero in A. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_ss (__m128 __A, int __B)
+{
+ return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_si2ss (__m128 __A, int __B)
+{
+ return _mm_cvtsi32_ss (__A, __B);
+}
+
+#ifdef __x86_64__
+/* Convert B to a SPFP value and insert it as element zero in A. */
+
+/* Intel intrinsic. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_ss (__m128 __A, long long __B)
+{
+ return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+
+/* Microsoft intrinsic. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_ss (__m128 __A, long long __B)
+{
+ return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+#endif
+
+/* Convert the two 32-bit values in B to SPFP form and insert them
+ as the two lower elements in A. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_ps (__m128 __A, __m64 __B)
+{
+ return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_pi2ps (__m128 __A, __m64 __B)
+{
+ return _mm_cvtpi32_ps (__A, __B);
+}
+
+/* Convert the four signed 16-bit values in A to SPFP form. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi16_ps (__m64 __A)
+{
+ __v4hi __sign;
+ __v2si __hisi, __losi;
+ __v4sf __zero, __ra, __rb;
+
+ /* This comparison against zero gives us a mask that can be used to
+ fill in the missing sign bits in the unpack operations below, so
+ that we get signed values after unpacking. */
+ __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);
+
+ /* Convert the four words to doublewords. */
+ __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
+ __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
+
+ /* Convert the doublewords to floating point two at a time. */
+ __zero = (__v4sf) _mm_setzero_ps ();
+ __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
+ __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
+
+ return (__m128) __builtin_ia32_movlhps (__ra, __rb);
+}
+
+/* Convert the four unsigned 16-bit values in A to SPFP form. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu16_ps (__m64 __A)
+{
+ __v2si __hisi, __losi;
+ __v4sf __zero, __ra, __rb;
+
+ /* Convert the four words to doublewords. */
+ __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
+ __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
+
+ /* Convert the doublewords to floating point two at a time. */
+ __zero = (__v4sf) _mm_setzero_ps ();
+ __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
+ __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
+
+ return (__m128) __builtin_ia32_movlhps (__ra, __rb);
+}
+
+/* Convert the low four signed 8-bit values in A to SPFP form. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi8_ps (__m64 __A)
+{
+ __v8qi __sign;
+
+ /* This comparison against zero gives us a mask that can be used to
+ fill in the missing sign bits in the unpack operations below, so
+ that we get signed values after unpacking. */
+ __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);
+
+ /* Convert the four low bytes to words. */
+ __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
+
+ return _mm_cvtpi16_ps(__A);
+}
+
+/* Convert the low four unsigned 8-bit values in A to SPFP form. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu8_ps(__m64 __A)
+{
+ __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
+ return _mm_cvtpu16_ps(__A);
+}
+
+/* Convert the four signed 32-bit values in A and B to SPFP form. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
+{
+ __v4sf __zero = (__v4sf) _mm_setzero_ps ();
+ __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
+ __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
+ return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
+}
+
+/* Convert the four SPFP values in A to four signed 16-bit integers. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi16(__m128 __A)
+{
+ __v4sf __hisf = (__v4sf)__A;
+ __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
+ __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
+ __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
+ return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
+}
+
+/* Convert the four SPFP values in A to four signed 8-bit integers. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi8(__m128 __A)
+{
+ __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
+ return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
+}
+
+/* Selects four specific SPFP values from A and B based on MASK. */
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
+{
+ return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
+}
+#else
+#define _mm_shuffle_ps(A, B, MASK) \
+ ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), \
+ (__v4sf)(__m128)(B), (int)(MASK)))
+#endif
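+
+/* Illustrative usage sketch (exposition only): broadcasting element 0 of
+   __a across all four lanes with the selector macro defined above:
+
+     __m128 __splat = _mm_shuffle_ps (__a, __a, _MM_SHUFFLE (0, 0, 0, 0));
+*/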
+
+/* Selects and interleaves the upper two SPFP values from A and B. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Selects and interleaves the lower two SPFP values from A and B. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Sets the upper two SPFP values with 64 bits of data loaded from P;
+ the lower two values are passed through from A. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pi (__m128 __A, __m64 const *__P)
+{
+ return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
+}
+
+/* Stores the upper two SPFP values of A into P. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pi (__m64 *__P, __m128 __A)
+{
+ __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
+}
+
+/* Moves the upper two values of B into the lower two values of A. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehl_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Moves the lower two values of B into the upper two values of A. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movelh_ps (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Sets the lower two SPFP values with 64 bits of data loaded from P;
+ the upper two values are passed through from A. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pi (__m128 __A, __m64 const *__P)
+{
+ return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
+}
+
+/* Stores the lower two SPFP values of A into P. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pi (__m64 *__P, __m128 __A)
+{
+ __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
+}
+
+/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_ps (__m128 __A)
+{
+ return __builtin_ia32_movmskps ((__v4sf)__A);
+}
+
+/* Return the contents of the control register. */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getcsr (void)
+{
+ return __builtin_ia32_stmxcsr ();
+}
+
+/* Read exception bits from the control register. */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_EXCEPTION_STATE (void)
+{
+ return _mm_getcsr() & _MM_EXCEPT_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_EXCEPTION_MASK (void)
+{
+ return _mm_getcsr() & _MM_MASK_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_ROUNDING_MODE (void)
+{
+ return _mm_getcsr() & _MM_ROUND_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_FLUSH_ZERO_MODE (void)
+{
+ return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
+}
+
+/* Set the control register to I. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setcsr (unsigned int __I)
+{
+ __builtin_ia32_ldmxcsr (__I);
+}
+
+/* Set exception bits in the control register. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_EXCEPTION_STATE(unsigned int __mask)
+{
+ _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_EXCEPTION_MASK (unsigned int __mask)
+{
+ _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_ROUNDING_MODE (unsigned int __mode)
+{
+ _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
+{
+ _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
+}
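+
+/* Illustrative usage sketch (exposition only): temporarily switching the
+   rounding mode and restoring the caller's mode afterwards:
+
+     unsigned int __saved = _MM_GET_ROUNDING_MODE ();
+     _MM_SET_ROUNDING_MODE (_MM_ROUND_TOWARD_ZERO);
+     ...
+     _MM_SET_ROUNDING_MODE (__saved);
+*/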
+
+/* Create a vector with element 0 as F and the rest zero. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ss (float __F)
+{
+ return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
+}
+
+/* Create a vector with all four elements equal to F. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ps (float __F)
+{
+ return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps1 (float __F)
+{
+ return _mm_set1_ps (__F);
+}
+
+/* Create a vector with element 0 as *P and the rest zero. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ss (float const *__P)
+{
+ return _mm_set_ss (*__P);
+}
+
+/* Create a vector with all four elements equal to *P. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_ps (float const *__P)
+{
+ return _mm_set1_ps (*__P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps1 (float const *__P)
+{
+ return _mm_load1_ps (__P);
+}
+
+/* Load four SPFP values from P. The address must be 16-byte aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps (float const *__P)
+{
+ return *(__m128 *)__P;
+}
+
+/* Load four SPFP values from P. The address need not be 16-byte aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ps (float const *__P)
+{
+ return *(__m128_u *)__P;
+}
+
+/* Load four SPFP values in reverse order. The address must be aligned. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_ps (float const *__P)
+{
+ __v4sf __tmp = *(__v4sf *)__P;
+ return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
+}
+
+/* Create the vector [Z Y X W]. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
+{
+ return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
+}
+
+/* Create the vector [W X Y Z]. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ps (float __Z, float __Y, float __X, float __W)
+{
+ return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
+}
+
+/* Stores the lower SPFP value. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ss (float *__P, __m128 __A)
+{
+ *__P = ((__v4sf)__A)[0];
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_f32 (__m128 __A)
+{
+ return ((__v4sf)__A)[0];
+}
+
+/* Store four SPFP values. The address must be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps (float *__P, __m128 __A)
+{
+ *(__m128 *)__P = __A;
+}
+
+/* Store four SPFP values. The address need not be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ps (float *__P, __m128 __A)
+{
+ *(__m128_u *)__P = __A;
+}
+
+/* Store the lower SPFP value across four words. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_ps (float *__P, __m128 __A)
+{
+ __v4sf __va = (__v4sf)__A;
+ __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
+ _mm_storeu_ps (__P, __tmp);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps1 (float *__P, __m128 __A)
+{
+ _mm_store1_ps (__P, __A);
+}
+
+/* Store four SPFP values in reverse order. The address must be aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_ps (float *__P, __m128 __A)
+{
+ __v4sf __va = (__v4sf)__A;
+ __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
+ _mm_store_ps (__P, __tmp);
+}
+
+/* Sets the low SPFP value of A from the low value of B. */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+ __extension__
+ (__attribute__((__vector_size__ (16))) int)
+ {4,1,2,3});
+}
+
+/* Extracts one of the four words of A. The selector N must be immediate. */
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_pi16 (__m64 const __A, int const __N)
+{
+ return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pextrw (__m64 const __A, int const __N)
+{
+ return _mm_extract_pi16 (__A, __N);
+}
+#else
+#define _mm_extract_pi16(A, N) \
+ ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))
+
+#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
+#endif
+
+/* Inserts word D into one of four words of A. The selector N must be
+ immediate. */
+#ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
+{
+ return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pinsrw (__m64 const __A, int const __D, int const __N)
+{
+ return _mm_insert_pi16 (__A, __D, __N);
+}
+#else
+#define _mm_insert_pi16(A, D, N) \
+ ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), \
+ (int)(D), (int)(N)))
+
+#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
+#endif
+
+/* Compute the element-wise maximum of signed 16-bit values. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pi16 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxsw (__m64 __A, __m64 __B)
+{
+ return _mm_max_pi16 (__A, __B);
+}
+
+/* Compute the element-wise maximum of unsigned 8-bit values. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pu8 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxub (__m64 __A, __m64 __B)
+{
+ return _mm_max_pu8 (__A, __B);
+}
+
+/* Compute the element-wise minimum of signed 16-bit values. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pi16 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminsw (__m64 __A, __m64 __B)
+{
+ return _mm_min_pi16 (__A, __B);
+}
+
+/* Compute the element-wise minimum of unsigned 8-bit values. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pu8 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminub (__m64 __A, __m64 __B)
+{
+ return _mm_min_pu8 (__A, __B);
+}
+
+/* Create an 8-bit mask of the signs of 8-bit values. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+ return __builtin_ia32_pmovmskb ((__v8qi)__A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmovmskb (__m64 __A)
+{
+ return _mm_movemask_pi8 (__A);
+}
+
+/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
+ in B and produce the high 16 bits of the 32-bit results. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhuw (__m64 __A, __m64 __B)
+{
+ return _mm_mulhi_pu16 (__A, __B);
+}
+
+/* Return a combination of the four 16-bit values in A. The selector
+ must be an immediate. */
+#ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int const __N)
+{
+ return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pshufw (__m64 __A, int const __N)
+{
+ return _mm_shuffle_pi16 (__A, __N);
+}
+#else
+#define _mm_shuffle_pi16(A, N) \
+ ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
+
+#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
+#endif
+
+/* Conditionally store byte elements of A into P. The high bit of each
+ byte in the selector N determines whether the corresponding byte from
+ A is stored. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
+{
+#ifdef __MMX_WITH_SSE__
+ /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
+ 64:127 at address __P. */
+ typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+ typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+ /* Zero-extend __A and __N to 128 bits. */
+ __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
+ __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
+
+ /* Check the alignment of __P. */
+ __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
+ if (offset)
+ {
+ /* If the misalignment of __P is greater than 8, decrease __P
+ by 8 bytes. Otherwise, decrease __P by the misalignment. */
+ if (offset > 8)
+ offset = 8;
+ __P = (char *) (((__SIZE_TYPE__) __P) - offset);
+
+ /* Shift __A128 and __N128 to the left by the adjustment. */
+ switch (offset)
+ {
+ case 1:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
+ break;
+ case 2:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
+ break;
+ case 3:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
+ break;
+ case 4:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
+ break;
+ case 5:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
+ break;
+ case 6:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
+ break;
+ case 7:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
+ break;
+ case 8:
+ __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
+ __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
+ break;
+ default:
+ break;
+ }
+ }
+ __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
+#else
+ __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+#endif
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_maskmovq (__m64 __A, __m64 __N, char *__P)
+{
+ _mm_maskmove_si64 (__A, __N, __P);
+}
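+
+/* Illustrative usage sketch (exposition only): store only the low four
+   bytes of __a to the hypothetical buffer __buf; a selector byte with
+   its high bit set enables the corresponding store:
+
+     __m64 __mask = _mm_set_pi8 (0, 0, 0, 0,
+                                 (char) 0x80, (char) 0x80,
+                                 (char) 0x80, (char) 0x80);
+     _mm_maskmove_si64 (__a, __mask, __buf);
+*/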
+
+/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+ return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+ return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+ values in A and B. Return the value in the lower 16-bit word; the
+ upper words are cleared. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64 __A, __m64 __B)
+{
+ return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+ return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+ __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
+}
+
+/* Likewise. The address must be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+ __builtin_ia32_movntps (__P, (__v4sf)__A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+ any subsequent store. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+ __builtin_ia32_sfence ();
+}
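+
+/* Illustrative usage sketch (exposition only): non-temporal stores are
+   weakly ordered, so a producer typically issues _mm_sfence after its
+   last streaming store and before signalling a consumer:
+
+     _mm_stream_ps (__dst, __v);
+     _mm_sfence ();
+*/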
+
+/* Transpose the 4x4 matrix composed of row[0-3]. */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+ __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
+ __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \
+ __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \
+ __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \
+ __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \
+ (row0) = __builtin_ia32_movlhps (__t0, __t1); \
+ (row1) = __builtin_ia32_movhlps (__t1, __t0); \
+ (row2) = __builtin_ia32_movlhps (__t2, __t3); \
+ (row3) = __builtin_ia32_movhlps (__t3, __t2); \
+} while (0)
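+
+/* Illustrative usage sketch (exposition only): transposing a 4x4
+   row-major matrix held at the hypothetical 16-byte-aligned pointer
+   __mat; the arguments must not be named __r0..__r3, which the macro
+   uses internally:
+
+     __m128 __row0 = _mm_load_ps (__mat + 0);
+     __m128 __row1 = _mm_load_ps (__mat + 4);
+     __m128 __row2 = _mm_load_ps (__mat + 8);
+     __m128 __row3 = _mm_load_ps (__mat + 12);
+     _MM_TRANSPOSE4_PS (__row0, __row1, __row2, __row3);
+*/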
+
+/* For backward source compatibility. */
+# include <emmintrin.h>
+
+#ifdef __DISABLE_SSE__
+#undef __DISABLE_SSE__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE__ */
+
+/* The execution of the next instruction is delayed by an
+ implementation-specific amount of time. The instruction does not
+ modify the architectural state. This is placed after the pop_options
+ pragma because it does not require SSE support in the processor; the
+ encoding is a nop on processors that do not support it. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+ __builtin_ia32_pause ();
+}
+
+#endif /* _XMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _XOPMMINTRIN_H_INCLUDED
+#define _XOPMMINTRIN_H_INCLUDED
+
+#include <fma4intrin.h>
+
+#ifndef __XOP__
+#pragma GCC push_options
+#pragma GCC target("xop")
+#define __DISABLE_XOP__
+#endif /* __XOP__ */
+
+/* Integer multiply/add instructions. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpmadcswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
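+/* Usage sketch (illustrative, not part of the upstream header): each
+   multiply/add form computes a per-lane a*b + c; the "maccs" variants
+   saturate the sum while the "macc" variants wrap.  Assuming a
+   translation unit compiled with -mxop:
+
+     __m128i mac16 (__m128i a, __m128i b, __m128i acc)
+     {
+       return _mm_maccs_epi16 (a, b, acc); // lane i: sat16 (a[i]*b[i] + acc[i])
+     }
+*/
+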
+/* Packed Integer Horizontal Add and Subtract */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddw_epi8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epi8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epi16(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi16(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi32(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphadddq ((__v4si)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddw_epu8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epu8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epu16(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu16(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu32(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubw_epi8(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubd_epi16(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubq_epi32(__m128i __A)
+{
+ return (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A);
+}
+
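+/* Usage sketch (illustrative): the horizontal adds widen while they
+   sum, e.g. _mm_haddd_epi16 adds each adjacent pair of signed 16-bit
+   lanes into one signed 32-bit lane.  Assuming -mxop:
+
+     __m128i pair_sums (__m128i v)
+     {
+       return _mm_haddd_epi16 (v); // dword i = w[2*i] + w[2*i+1], w = 16-bit lanes
+     }
+*/
+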
+/* Vector conditional move and permute */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpcmov (__A, __B, __C);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+ return (__m256i) __builtin_ia32_vpcmov256 (__A, __B, __C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+ return (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
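+/* Usage sketch (illustrative): _mm_cmov_si128 is a bitwise select,
+   (A & C) | (B & ~C) -- each set bit of the selector C takes the bit
+   from A, each clear bit takes it from B.  Assuming -mxop:
+
+     __m128i bit_blend (__m128i a, __m128i b, __m128i mask)
+     {
+       return _mm_cmov_si128 (a, b, mask);
+     }
+*/
+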
+/* Packed Integer Rotates and Shifts
+ Rotates - Non-Immediate form */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B);
+}
+
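+/* Usage sketch (illustrative): the variable rotates take a per-lane
+   count from B; a positive count rotates left, a negative count
+   rotates right.  Assuming -mxop:
+
+     __m128i rot32 (__m128i v, __m128i counts)
+     {
+       return _mm_rot_epi32 (v, counts);
+     }
+*/
+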
+/* Rotates - Immediate form */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi8(__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi16(__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi32(__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi64(__m128i __A, const int __B)
+{
+ return (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B);
+}
+#else
+#define _mm_roti_epi8(A, N) \
+ ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi16(A, N) \
+ ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi32(A, N) \
+ ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi64(A, N) \
+ ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N)))
+#endif
+
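+/* Usage sketch (illustrative): the immediate forms require a
+   compile-time constant count, which is why the macro fallback above
+   exists for builds without __OPTIMIZE__.  Assuming -mxop:
+
+     __m128i rot32_by8 (__m128i v)
+     {
+       return _mm_roti_epi32 (v, 8); // rotate each dword left by 8
+     }
+*/
+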
+/* Shifts */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshad ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B);
+}
+
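+/* Usage sketch (illustrative): the _mm_shl_* forms shift logically and
+   the _mm_sha_* forms shift arithmetically; the signed per-lane count
+   in B shifts left when positive and right when negative.  Assuming
+   -mxop:
+
+     __m128i sar32 (__m128i v, __m128i counts)
+     {
+       return _mm_sha_epi32 (v, counts); // per-lane arithmetic shift
+     }
+*/
+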
+/* Compare and Predicate Generation
+ pcom (integer, unsigned bytes) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B);
+}
+
+/* pcom (integer, unsigned words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+/* pcom (integer, unsigned double words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B);
+}
+
+/* pcom (integer, unsigned quad words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B);
+}
+
+/* pcom (integer, signed bytes) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B);
+}
+
+/* pcom (integer, signed words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B);
+}
+
+/* pcom (integer, signed double words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B);
+}
+
+/* pcom (integer, signed quad words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+ return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B);
+}
+
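+/* Usage sketch (illustrative): every _mm_com* form above yields a
+   per-lane mask, all-ones where the predicate holds and zero
+   elsewhere; the comfalse/comtrue forms produce constant
+   all-zeros/all-ones.  Assuming -mxop:
+
+     __m128i below_u8 (__m128i a, __m128i b)
+     {
+       return _mm_comlt_epu8 (a, b); // 0xff per byte where a < b (unsigned)
+     }
+*/
+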
+/* FRCZ */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_ps (__m128 __A)
+{
+ return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_pd (__m128d __A)
+{
+ return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_ss (__m128 __A, __m128 __B)
+{
+ return (__m128) __builtin_ia32_movss ((__v4sf)__A,
+ (__v4sf)
+ __builtin_ia32_vfrczss ((__v4sf)__B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_sd (__m128d __A, __m128d __B)
+{
+ return (__m128d) __builtin_ia32_movsd ((__v2df)__A,
+ (__v2df)
+ __builtin_ia32_vfrczsd ((__v2df)__B));
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_frcz_ps (__m256 __A)
+{
+ return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_frcz_pd (__m256d __A)
+{
+ return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A);
+}
+
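+/* Usage sketch (illustrative): FRCZ extracts the fractional part of
+   each lane, so 2.75f becomes 0.75f (and -2.75f becomes -0.75f).
+   Assuming -mxop:
+
+     __m128 fract (__m128 v)
+     {
+       return _mm_frcz_ps (v);
+     }
+*/
+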
+/* PERMIL2 */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I)
+{
+ return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
+ (__v2df)__Y,
+ (__v2di)__C,
+ __I);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I)
+{
+ return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X,
+ (__v4df)__Y,
+ (__v4di)__C,
+ __I);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I)
+{
+ return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
+ (__v4sf)__Y,
+ (__v4si)__C,
+ __I);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I)
+{
+ return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X,
+ (__v8sf)__Y,
+ (__v8si)__C,
+ __I);
+}
+#else
+#define _mm_permute2_pd(X, Y, C, I) \
+ ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), \
+ (__v2df)(__m128d)(Y), \
+ (__v2di)(__m128i)(C), \
+ (int)(I)))
+
+#define _mm256_permute2_pd(X, Y, C, I) \
+ ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), \
+ (__v4df)(__m256d)(Y), \
+ (__v4di)(__m256i)(C), \
+ (int)(I)))
+
+#define _mm_permute2_ps(X, Y, C, I) \
+ ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), \
+ (__v4sf)(__m128)(Y), \
+ (__v4si)(__m128i)(C), \
+ (int)(I)))
+
+#define _mm256_permute2_ps(X, Y, C, I) \
+ ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \
+ (__v8sf)(__m256)(Y), \
+ (__v8si)(__m256i)(C), \
+ (int)(I)))
+#endif /* __OPTIMIZE__ */
+
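+/* Usage sketch (illustrative): VPERMIL2 picks each destination lane
+   from one of the two sources according to the selector vector C; the
+   immediate selects an optional match-and-zero mode, with 0 meaning
+   no zeroing.  Assuming -mxop and a constant final argument:
+
+     __m128d pick_pd (__m128d x, __m128d y, __m128i sel)
+     {
+       return _mm_permute2_pd (x, y, sel, 0);
+     }
+*/
+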
+#ifdef __DISABLE_XOP__
+#undef __DISABLE_XOP__
+#pragma GCC pop_options
+#endif /* __DISABLE_XOP__ */
+
+#endif /* _XOPMMINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsavecintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVECINTRIN_H_INCLUDED
+#define _XSAVECINTRIN_H_INCLUDED
+
+#ifndef __XSAVEC__
+#pragma GCC push_options
+#pragma GCC target("xsavec")
+#define __DISABLE_XSAVEC__
+#endif /* __XSAVEC__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsavec (void *__P, long long __M)
+{
+ __builtin_ia32_xsavec (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsavec64 (void *__P, long long __M)
+{
+ __builtin_ia32_xsavec64 (__P, __M);
+}
+#endif
+
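+/* Usage sketch (illustrative): _xsavec stores the extended states
+   selected by the mask in compacted format; the buffer must be
+   64-byte aligned and large enough for the selected components
+   (CPUID leaf 0xD reports the size).  Assuming -mxsavec:
+
+     _Alignas (64) static char buf[4096]; // size is illustrative
+     _xsavec (buf, 0x7);                  // x87 | SSE | AVX state
+*/
+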
+#ifdef __DISABLE_XSAVEC__
+#undef __DISABLE_XSAVEC__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVEC__ */
+
+#endif /* _XSAVECINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsaveintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVEINTRIN_H_INCLUDED
+#define _XSAVEINTRIN_H_INCLUDED
+
+#ifndef __XSAVE__
+#pragma GCC push_options
+#pragma GCC target("xsave")
+#define __DISABLE_XSAVE__
+#endif /* __XSAVE__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave (void *__P, long long __M)
+{
+ __builtin_ia32_xsave (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor (void *__P, long long __M)
+{
+ __builtin_ia32_xrstor (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsetbv (unsigned int __A, long long __V)
+{
+ __builtin_ia32_xsetbv (__A, __V);
+}
+
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xgetbv (unsigned int __A)
+{
+ return __builtin_ia32_xgetbv (__A);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave64 (void *__P, long long __M)
+{
+ __builtin_ia32_xsave64 (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor64 (void *__P, long long __M)
+{
+ __builtin_ia32_xrstor64 (__P, __M);
+}
+#endif
+
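+/* Usage sketch (illustrative): _xgetbv reads an extended control
+   register; XCR0 (index 0) reports which state components the OS has
+   enabled.  A common feature probe, assuming -mxsave:
+
+     int os_avx_enabled (void)
+     {
+       return (_xgetbv (0) & 0x6) == 0x6; // XMM and YMM state bits
+     }
+*/
+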
+#ifdef __DISABLE_XSAVE__
+#undef __DISABLE_XSAVE__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVE__ */
+
+#endif /* _XSAVEINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsaveoptintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVEOPTINTRIN_H_INCLUDED
+#define _XSAVEOPTINTRIN_H_INCLUDED
+
+#ifndef __XSAVEOPT__
+#pragma GCC push_options
+#pragma GCC target("xsaveopt")
+#define __DISABLE_XSAVEOPT__
+#endif /* __XSAVEOPT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt (void *__P, long long __M)
+{
+ __builtin_ia32_xsaveopt (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt64 (void *__P, long long __M)
+{
+ __builtin_ia32_xsaveopt64 (__P, __M);
+}
+#endif
+
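+/* Usage sketch (illustrative): _xsaveopt behaves like _xsave but may
+   skip components that are unmodified or in their initial
+   configuration, so repeated context saves touch less memory.  The
+   calling pattern is the same as _xsave:
+
+     _Alignas (64) static char buf[4096]; // size is illustrative
+     _xsaveopt (buf, 0x7);                // same mask bits as _xsave
+*/
+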
+#ifdef __DISABLE_XSAVEOPT__
+#undef __DISABLE_XSAVEOPT__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVEOPT__ */
+
+#endif /* _XSAVEOPTINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsavesintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVESINTRIN_H_INCLUDED
+#define _XSAVESINTRIN_H_INCLUDED
+
+#ifndef __XSAVES__
+#pragma GCC push_options
+#pragma GCC target("xsaves")
+#define __DISABLE_XSAVES__
+#endif /* __XSAVES__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaves (void *__P, long long __M)
+{
+ __builtin_ia32_xsaves (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstors (void *__P, long long __M)
+{
+ __builtin_ia32_xrstors (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstors64 (void *__P, long long __M)
+{
+ __builtin_ia32_xrstors64 (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaves64 (void *__P, long long __M)
+{
+ __builtin_ia32_xsaves64 (__P, __M);
+}
+#endif
+
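+/* Usage sketch (illustrative): _xsaves/_xrstors are the supervisor
+   variants -- they use the compacted format, can save
+   supervisor-state components, and fault outside ring 0, so they
+   belong in kernel context-switch paths rather than applications:
+
+     _xsaves (kbuf, mask); // kbuf, mask: hypothetical kernel-side values
+*/
+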
+#ifdef __DISABLE_XSAVES__
+#undef __DISABLE_XSAVES__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVES__ */
+
+#endif /* _XSAVESINTRIN_H_INCLUDED */
--- /dev/null
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xtestintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XTESTINTRIN_H_INCLUDED
+#define _XTESTINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+/* Return non-zero if the instruction executes inside an RTM or HLE code
+ region. Return zero otherwise. */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xtest (void)
+{
+ return __builtin_ia32_xtest ();
+}
+
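+/* Usage sketch (illustrative): paired with the RTM intrinsics from
+   <rtmintrin.h> and compiled with -mrtm, _xtest distinguishes the
+   transactional path from the fallback path:
+
+     if (_xbegin () == _XBEGIN_STARTED)
+       {
+         // _xtest () returns non-zero here
+         _xend ();
+       }
+*/
+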
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _XTESTINTRIN_H_INCLUDED */