xenbits.xensource.com Git - unikraft/libs/intel-intrinsics.git/commitdiff
Implement support for GCC
author	Andrei Tatar <andrei@unikraft.io>
Mon, 17 Jul 2023 17:07:59 +0000 (19:07 +0200)
committer	Unikraft <monkey@unikraft.io>
Thu, 10 Aug 2023 22:51:11 +0000 (22:51 +0000)
This change adds support for GCC on par with the existing LLVM/Clang
support, by providing the native x86 intrinsics headers from the GCC
13.1.0 release.
The file `mm_malloc.h` is taken from GCC's upstream `pmm_malloc.h`.
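
A minimal usage sketch of what this enables, assuming
CONFIG_LIBINTEL_INTRINSICS=y and an x86_64 build with AVX2 enabled
(e.g. -mavx2); the example names and flags below are illustrative only:

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
            /* 32-byte-aligned allocation via mm_malloc.h */
            int *out = _mm_malloc(8 * sizeof(int), 32);
            if (!out)
                    return 1;

            /* Add two 8-lane vectors of 32-bit integers (AVX2) */
            __m256i a = _mm256_set1_epi32(1);
            __m256i b = _mm256_set1_epi32(2);
            _mm256_store_si256((__m256i *)out, _mm256_add_epi32(a, b));

            printf("%d\n", out[0]);        /* prints 3 */
            _mm_free(out);
            return 0;
    }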

Signed-off-by: Andrei Tatar <andrei@unikraft.io>
Reviewed-by: Maria Sfiraiala <maria.sfiraiala@gmail.com>
Reviewed-by: Radu Nichita <radunichita99@gmail.com>
Approved-by: Razvan Deaconescu <razvand@unikraft.io>
Tested-by: Unikraft CI <monkey@unikraft.io>
GitHub-Closes: #3

100 files changed:
Makefile.uk
include-gcc/adxintrin.h [new file with mode: 0644]
include-gcc/ammintrin.h [new file with mode: 0644]
include-gcc/amxbf16intrin.h [new file with mode: 0644]
include-gcc/amxcomplexintrin.h [new file with mode: 0644]
include-gcc/amxfp16intrin.h [new file with mode: 0644]
include-gcc/amxint8intrin.h [new file with mode: 0644]
include-gcc/amxtileintrin.h [new file with mode: 0644]
include-gcc/avx2intrin.h [new file with mode: 0644]
include-gcc/avx5124fmapsintrin.h [new file with mode: 0644]
include-gcc/avx5124vnniwintrin.h [new file with mode: 0644]
include-gcc/avx512bf16intrin.h [new file with mode: 0644]
include-gcc/avx512bf16vlintrin.h [new file with mode: 0644]
include-gcc/avx512bitalgintrin.h [new file with mode: 0644]
include-gcc/avx512bwintrin.h [new file with mode: 0644]
include-gcc/avx512cdintrin.h [new file with mode: 0644]
include-gcc/avx512dqintrin.h [new file with mode: 0644]
include-gcc/avx512erintrin.h [new file with mode: 0644]
include-gcc/avx512fintrin.h [new file with mode: 0644]
include-gcc/avx512fp16intrin.h [new file with mode: 0644]
include-gcc/avx512fp16vlintrin.h [new file with mode: 0644]
include-gcc/avx512ifmaintrin.h [new file with mode: 0644]
include-gcc/avx512ifmavlintrin.h [new file with mode: 0644]
include-gcc/avx512pfintrin.h [new file with mode: 0644]
include-gcc/avx512vbmi2intrin.h [new file with mode: 0644]
include-gcc/avx512vbmi2vlintrin.h [new file with mode: 0644]
include-gcc/avx512vbmiintrin.h [new file with mode: 0644]
include-gcc/avx512vbmivlintrin.h [new file with mode: 0644]
include-gcc/avx512vlbwintrin.h [new file with mode: 0644]
include-gcc/avx512vldqintrin.h [new file with mode: 0644]
include-gcc/avx512vlintrin.h [new file with mode: 0644]
include-gcc/avx512vnniintrin.h [new file with mode: 0644]
include-gcc/avx512vnnivlintrin.h [new file with mode: 0644]
include-gcc/avx512vp2intersectintrin.h [new file with mode: 0644]
include-gcc/avx512vp2intersectvlintrin.h [new file with mode: 0644]
include-gcc/avx512vpopcntdqintrin.h [new file with mode: 0644]
include-gcc/avx512vpopcntdqvlintrin.h [new file with mode: 0644]
include-gcc/avxifmaintrin.h [new file with mode: 0644]
include-gcc/avxintrin.h [new file with mode: 0644]
include-gcc/avxneconvertintrin.h [new file with mode: 0644]
include-gcc/avxvnniint8intrin.h [new file with mode: 0644]
include-gcc/avxvnniintrin.h [new file with mode: 0644]
include-gcc/bmi2intrin.h [new file with mode: 0644]
include-gcc/bmiintrin.h [new file with mode: 0644]
include-gcc/cetintrin.h [new file with mode: 0644]
include-gcc/cldemoteintrin.h [new file with mode: 0644]
include-gcc/clflushoptintrin.h [new file with mode: 0644]
include-gcc/clwbintrin.h [new file with mode: 0644]
include-gcc/clzerointrin.h [new file with mode: 0644]
include-gcc/cmpccxaddintrin.h [new file with mode: 0644]
include-gcc/emmintrin.h [new file with mode: 0644]
include-gcc/enqcmdintrin.h [new file with mode: 0644]
include-gcc/f16cintrin.h [new file with mode: 0644]
include-gcc/fma4intrin.h [new file with mode: 0644]
include-gcc/fmaintrin.h [new file with mode: 0644]
include-gcc/fxsrintrin.h [new file with mode: 0644]
include-gcc/gfniintrin.h [new file with mode: 0644]
include-gcc/hresetintrin.h [new file with mode: 0644]
include-gcc/ia32intrin.h [new file with mode: 0644]
include-gcc/immintrin.h [new file with mode: 0644]
include-gcc/keylockerintrin.h [new file with mode: 0644]
include-gcc/lwpintrin.h [new file with mode: 0644]
include-gcc/lzcntintrin.h [new file with mode: 0644]
include-gcc/mm3dnow.h [new file with mode: 0644]
include-gcc/mm_malloc.h [new file with mode: 0644]
include-gcc/mmintrin.h [new file with mode: 0644]
include-gcc/movdirintrin.h [new file with mode: 0644]
include-gcc/mwaitintrin.h [new file with mode: 0644]
include-gcc/mwaitxintrin.h [new file with mode: 0644]
include-gcc/pconfigintrin.h [new file with mode: 0644]
include-gcc/pkuintrin.h [new file with mode: 0644]
include-gcc/pmmintrin.h [new file with mode: 0644]
include-gcc/popcntintrin.h [new file with mode: 0644]
include-gcc/prfchiintrin.h [new file with mode: 0644]
include-gcc/prfchwintrin.h [new file with mode: 0644]
include-gcc/raointintrin.h [new file with mode: 0644]
include-gcc/rdseedintrin.h [new file with mode: 0644]
include-gcc/rtmintrin.h [new file with mode: 0644]
include-gcc/serializeintrin.h [new file with mode: 0644]
include-gcc/sgxintrin.h [new file with mode: 0644]
include-gcc/shaintrin.h [new file with mode: 0644]
include-gcc/smmintrin.h [new file with mode: 0644]
include-gcc/tbmintrin.h [new file with mode: 0644]
include-gcc/tmmintrin.h [new file with mode: 0644]
include-gcc/tsxldtrkintrin.h [new file with mode: 0644]
include-gcc/uintrintrin.h [new file with mode: 0644]
include-gcc/vaesintrin.h [new file with mode: 0644]
include-gcc/vpclmulqdqintrin.h [new file with mode: 0644]
include-gcc/waitpkgintrin.h [new file with mode: 0644]
include-gcc/wbnoinvdintrin.h [new file with mode: 0644]
include-gcc/wmmintrin.h [new file with mode: 0644]
include-gcc/x86gprintrin.h [new file with mode: 0644]
include-gcc/x86intrin.h [new file with mode: 0644]
include-gcc/xmmintrin.h [new file with mode: 0644]
include-gcc/xopintrin.h [new file with mode: 0644]
include-gcc/xsavecintrin.h [new file with mode: 0644]
include-gcc/xsaveintrin.h [new file with mode: 0644]
include-gcc/xsaveoptintrin.h [new file with mode: 0644]
include-gcc/xsavesintrin.h [new file with mode: 0644]
include-gcc/xtestintrin.h [new file with mode: 0644]

diff --git a/Makefile.uk b/Makefile.uk
index b8cc585934296e2179c86290185c81e4f65ed1d7..fb83280ac3a5a255981e6ab2314dc69490828220 100644 (file)
@@ -41,6 +41,9 @@ $(eval $(call addlib_s,libintel_intrinsics,$(CONFIG_LIBINTEL_INTRINSICS)))
 # Library includes
 ################################################################################
 ifeq ($(CONFIG_LIBINTEL_INTRINSICS),y)
+CINCLUDES-$(call have_gcc) += -I$(LIBINTEL_INTRINSICS_BASE)/include-gcc
+CXXINCLUDES-$(call have_gcc) += -I$(LIBINTEL_INTRINSICS_BASE)/include-gcc
+
 CINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
 CXXINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
 endif
diff --git a/include-gcc/adxintrin.h b/include-gcc/adxintrin.h
new file mode 100644 (file)
index 0000000..e7b9999
--- /dev/null
@@ -0,0 +1,81 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <adxintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _ADXINTRIN_H_INCLUDED
+#define _ADXINTRIN_H_INCLUDED
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_subborrow_u32 (unsigned char __CF, unsigned int __X,
+               unsigned int __Y, unsigned int *__P)
+{
+  return __builtin_ia32_sbb_u32 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarry_u32 (unsigned char __CF, unsigned int __X,
+              unsigned int __Y, unsigned int *__P)
+{
+  return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarryx_u32 (unsigned char __CF, unsigned int __X,
+               unsigned int __Y, unsigned int *__P)
+{
+  return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_subborrow_u64 (unsigned char __CF, unsigned long long __X,
+               unsigned long long __Y, unsigned long long *__P)
+{
+  return __builtin_ia32_sbb_u64 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarry_u64 (unsigned char __CF, unsigned long long __X,
+              unsigned long long __Y, unsigned long long *__P)
+{
+  return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_addcarryx_u64 (unsigned char __CF, unsigned long long __X,
+               unsigned long long __Y, unsigned long long *__P)
+{
+  return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P);
+}
+#endif
+
+#endif /* _ADXINTRIN_H_INCLUDED */
diff --git a/include-gcc/ammintrin.h b/include-gcc/ammintrin.h
new file mode 100644 (file)
index 0000000..24cda1f
--- /dev/null
@@ -0,0 +1,93 @@
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the AMD Programmers
+   Manual Update, version 2.x */
+
+#ifndef _AMMINTRIN_H_INCLUDED
+#define _AMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE3, SSE2 and SSE header files*/
+#include <pmmintrin.h>
+
+#ifndef __SSE4A__
+#pragma GCC push_options
+#pragma GCC target("sse4a")
+#define __DISABLE_SSE4A__
+#endif /* __SSE4A__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_sd (double * __P, __m128d __Y)
+{
+  __builtin_ia32_movntsd (__P, (__v2df) __Y);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ss (float * __P, __m128 __Y)
+{
+  __builtin_ia32_movntss (__P, (__v4sf) __Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_si64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L)
+{
+  return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L);
+}
+#else
+#define _mm_extracti_si64(X, I, L)                                     \
+  ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X),              \
+                                   (unsigned int)(I), (unsigned int)(L)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_si64 (__m128i __X,__m128i __Y)
+{
+  return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L)
+{
+  return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L);
+}
+#else
+#define _mm_inserti_si64(X, Y, I, L)                                   \
+  ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X),            \
+                                     (__v2di)(__m128i)(Y),             \
+                                     (unsigned int)(I), (unsigned int)(L)))
+#endif
+
+#ifdef __DISABLE_SSE4A__
+#undef __DISABLE_SSE4A__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4A__ */
+
+#endif /* _AMMINTRIN_H_INCLUDED */
diff --git a/include-gcc/amxbf16intrin.h b/include-gcc/amxbf16intrin.h
new file mode 100644 (file)
index 0000000..33ee234
--- /dev/null
@@ -0,0 +1,52 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxbf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXBF16INTRIN_H_INCLUDED
+#define _AMXBF16INTRIN_H_INCLUDED
+
+#if !defined(__AMX_BF16__)
+#pragma GCC push_options
+#pragma GCC target("amx-bf16")
+#define __DISABLE_AMX_BF16__
+#endif /* __AMX_BF16__ */
+
+#if defined(__x86_64__)
+#define _tile_dpbf16ps_internal(dst,src1,src2)                                 \
+  __asm__ volatile\
+  ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpbf16ps(dst,src1,src2)                                  \
+  _tile_dpbf16ps_internal (dst, src1, src2)
+
+#endif
+
+#ifdef __DISABLE_AMX_BF16__
+#undef __DISABLE_AMX_BF16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_BF16__ */
+
+#endif /* _AMXBF16INTRIN_H_INCLUDED */
diff --git a/include-gcc/amxcomplexintrin.h b/include-gcc/amxcomplexintrin.h
new file mode 100644 (file)
index 0000000..6ea1eca
--- /dev/null
@@ -0,0 +1,59 @@
+/* Copyright (C) 2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxcomplexintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXCOMPLEXINTRIN_H_INCLUDED
+#define _AMXCOMPLEXINTRIN_H_INCLUDED
+
+#if !defined(__AMX_COMPLEX__)
+#pragma GCC push_options
+#pragma GCC target("amx-complex")
+#define __DISABLE_AMX_COMPLEX__
+#endif /* __AMX_COMPLEX__ */
+
+#if defined(__x86_64__)
+#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3)                         \
+  __asm__ volatile\
+  ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3)                         \
+  __asm__ volatile\
+  ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::)
+
+#define _tile_cmmimfp16ps(src1_dst,src2,src3)                                  \
+  _tile_cmmimfp16ps_internal (src1_dst, src2, src3)
+
+#define _tile_cmmrlfp16ps(src1_dst,src2,src3)                                  \
+  _tile_cmmrlfp16ps_internal (src1_dst, src2, src3)
+
+#endif
+
+#ifdef __DISABLE_AMX_COMPLEX__
+#undef __DISABLE_AMX_COMPLEX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_COMPLEX__ */
+
+#endif /* _AMXCOMPLEXINTRIN_H_INCLUDED */
diff --git a/include-gcc/amxfp16intrin.h b/include-gcc/amxfp16intrin.h
new file mode 100644 (file)
index 0000000..340945b
--- /dev/null
@@ -0,0 +1,46 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxfp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXFP16INTRIN_H_INCLUDED
+#define _AMXFP16INTRIN_H_INCLUDED
+
+#if defined(__x86_64__)
+#define _tile_dpfp16ps_internal(dst,src1,src2)                 \
+  __asm__ volatile \
+  ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpfp16ps(dst,src1,src2)                          \
+  _tile_dpfp16ps_internal (dst,src1,src2)
+
+#endif
+
+#ifdef __DISABLE_AMX_FP16__
+#undef __DISABLE_AMX_FP16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_FP16__ */
+
+#endif /* _AMXFP16INTRIN_H_INCLUDED */
diff --git a/include-gcc/amxint8intrin.h b/include-gcc/amxint8intrin.h
new file mode 100644 (file)
index 0000000..6b69cfb
--- /dev/null
@@ -0,0 +1,61 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxint8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXINT8INTRIN_H_INCLUDED
+#define _AMXINT8INTRIN_H_INCLUDED
+
+#if !defined(__AMX_INT8__)
+#pragma GCC push_options
+#pragma GCC target("amx-int8")
+#define __DISABLE_AMX_INT8__
+#endif /* __AMX_INT8__ */
+
+#if defined(__x86_64__)
+#define _tile_int8_dp_internal(name,dst,src1,src2)                                     \
+  __asm__ volatile                                                     \
+  ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::)
+
+#define _tile_dpbssd(dst,src1,src2)                                    \
+  _tile_int8_dp_internal (tdpbssd, dst, src1, src2)
+
+#define _tile_dpbsud(dst,src1,src2)                                    \
+  _tile_int8_dp_internal (tdpbsud, dst, src1, src2)
+
+#define _tile_dpbusd(dst,src1,src2)                                    \
+  _tile_int8_dp_internal (tdpbusd, dst, src1, src2)
+
+#define _tile_dpbuud(dst,src1,src2)                                    \
+  _tile_int8_dp_internal (tdpbuud, dst, src1, src2)
+
+#endif
+
+#ifdef __DISABLE_AMX_INT8__
+#undef __DISABLE_AMX_INT8__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_INT8__ */
+
+#endif /* _AMXINT8INTRIN_H_INCLUDED */
diff --git a/include-gcc/amxtileintrin.h b/include-gcc/amxtileintrin.h
new file mode 100644 (file)
index 0000000..cc60226
--- /dev/null
@@ -0,0 +1,98 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <amxtileintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AMXTILEINTRIN_H_INCLUDED
+#define _AMXTILEINTRIN_H_INCLUDED
+
+#if !defined(__AMX_TILE__)
+#pragma GCC push_options
+#pragma GCC target("amx-tile")
+#define __DISABLE_AMX_TILE__
+#endif /* __AMX_TILE__ */
+
+#if defined(__x86_64__)
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tile_loadconfig (const void *__config)
+{
+  __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config)));
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tile_storeconfig (void *__config)
+{
+  __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config)));
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tile_release (void)
+{
+  __asm__ volatile ("tilerelease" ::);
+}
+
+#define _tile_loadd(dst,base,stride)           \
+  _tile_loadd_internal (dst, base, stride)
+
+#define _tile_loadd_internal(dst,base,stride)                          \
+  __asm__ volatile                                                     \
+  ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \
+   :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+
+#define _tile_stream_loadd(dst,base,stride)            \
+  _tile_stream_loadd_internal (dst, base, stride)
+
+#define _tile_stream_loadd_internal(dst,base,stride)                   \
+  __asm__ volatile                                                     \
+  ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \
+   :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)))
+
+#define _tile_stored(dst,base,stride)          \
+  _tile_stored_internal (dst, base, stride)
+
+#define _tile_stored_internal(src,base,stride)                         \
+  __asm__ volatile                                                     \
+  ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \
+   :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \
+   : "memory")
+
+#define _tile_zero(dst)                                \
+  _tile_zero_internal (dst)
+
+#define _tile_zero_internal(dst)               \
+  __asm__ volatile                             \
+  ("tilezero\t%%tmm"#dst ::)
+
+#endif
+
+#ifdef __DISABLE_AMX_TILE__
+#undef __DISABLE_AMX_TILE__
+#pragma GCC pop_options
+#endif /* __DISABLE_AMX_TILE__ */
+
+#endif /* _AMXTILEINTRIN_H_INCLUDED */
diff --git a/include-gcc/avx2intrin.h b/include-gcc/avx2intrin.h
new file mode 100644 (file)
index 0000000..1b9c816
--- /dev/null
@@ -0,0 +1,1923 @@
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX2INTRIN_H_INCLUDED
+#define _AVX2INTRIN_H_INCLUDED
+
+#ifndef __AVX2__
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#define __DISABLE_AVX2__
+#endif /* __AVX2__ */
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+   byte integers in the first 2 operands.  Starting offsets within
+   operands are determined by the 3rd mask operand.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X,
+                                             (__v32qi)__Y, __M);
+}
+#else
+#define _mm256_mpsadbw_epu8(X, Y, M)                                   \
+  ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X),         \
+                                       (__v32qi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi8 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi16 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi32 (__m256i __A)
+{
+  return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_packus_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v32qu)__A + (__v32qu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v16hu)__A + (__v16hu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8su)__A + (__v8su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A + (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_adds_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A,
+                                             (__v4di)__B,
+                                             __N * 8);
+}
+#else
+/* In that case (__N*8) will be in vreg, and insn will not be matched. */
+/* Use define instead */
+#define _mm256_alignr_epi8(A, B, N)                               \
+  ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A),     \
+                                       (__v4di)(__m256i)(B),      \
+                                       (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A & (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_avg_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M)
+{
+  return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X,
+                                              (__v32qi)__Y,
+                                              (__v32qi)__M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X,
+                                             (__v16hi)__Y,
+                                              __M);
+}
+#else
+#define _mm256_blend_epi16(X, Y, M)                                    \
+  ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X),         \
+                                       (__v16hi)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v32qi)__A == (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v16hi)__A == (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8si)__A == (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4di)__A == (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v32qs)__A > (__v32qs)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v16hi)__A > (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8si)__A > (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4di)__A > (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X,
+                                            (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadds_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X,
+                                             (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X,
+                                            (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X,
+                                             (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddubs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X,
+                                               (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A,
+                                            (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_epi8 (__m256i __A)
+{
+  return __builtin_ia32_pmovmskb256 ((__v32qi)__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwd256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X,
+                                              (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mulhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v16hu)__A * (__v16hu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8su)__A * (__v8su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_epu32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A | (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sad_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X,
+                                            (__v32qi)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_epi32 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflehi_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shufflelo_epi16 (__m256i __A, const int __mask)
+{
+  return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask);
+}
+#else
+#define _mm256_shuffle_epi32(A, N) \
+  ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N)))
+#define _mm256_shufflehi_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#define _mm256_shufflelo_epi16(A, N) \
+  ((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi16 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sign_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bslli_epi128 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_bslli_epi128(A, N) \
+  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
+#define _mm256_slli_si256(A, N) \
+  ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_slli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sll_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bsrli_epi128 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_si256 (__m256i __A, const int __N)
+{
+  return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8);
+}
+#else
+#define _mm256_bsrli_epi128(A, N) \
+  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
+#define _mm256_srli_si256(A, N) \
+  ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi16 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi16 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi32 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi32 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srli_epi64 (__m256i __A, int __B)
+{
+  return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srl_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v32qu)__A - (__v32qu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v16hu)__A - (__v16hu)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8su)__A - (__v8su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A - (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_subs_epu16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_psubusw256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_si256 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_load_si256 (__m256i const *__X)
+{
+  return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastss_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastss_ps (__m128 __X)
+{
+  return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsd_pd (__m128d __X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastsi128_si256 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X);
+}
+
+#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X)
+#define _mm_broadcastsd_pd(X) _mm_movedup_pd(X)
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X,
+                                             (__v4si)__Y,
+                                             __M);
+}
+#else
+#define _mm_blend_epi32(X, Y, M)                                       \
+  ((__m128i) __builtin_ia32_pblendd128 ((__v4si)(__m128i)(X),          \
+                                       (__v4si)(__m128i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X,
+                                             (__v8si)__Y,
+                                             __M);
+}
+#else
+#define _mm256_blend_epi32(X, Y, M)                                    \
+  ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X),          \
+                                       (__v8si)(__m256i)(Y), (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastb_epi8 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastw_epi16 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastd_epi32 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastq_epi64 (__m128i __X)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastb_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastw_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastd_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastq_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_pd (__m256d __X, const int __M)
+{
+  return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M);
+}
+#else
+#define _mm256_permute4x64_pd(X, M)                           \
+  ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M)))
+#endif
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute4x64_epi64 (__m256i __X, const int __M)
+{
+  return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_permute4x64_epi64(X, M)                        \
+  ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_permti256 ((__v4di)__X, (__v4di)__Y, __M);
+}
+#else
+#define _mm256_permute2x128_si256(X, Y, M)                             \
+  ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti128_si256 (__m256i __X, const int __M)
+{
+  return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M);
+}
+#else
+#define _mm256_extracti128_si256(X, M)                         \
+  ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M)
+{
+  return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M);
+}
+#else
+#define _mm256_inserti128_si256(X, Y, M)                        \
+  ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \
+                                          (__v2di)(__m128i)(Y), \
+                                          (int)(M)))
+#endif
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi32 (int const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X,
+                                               (__v8si)__M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_epi64 (long long const *__X, __m256i __M )
+{
+  return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X,
+                                               (__v4di)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi32 (int const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X,
+                                            (__v4si)__M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_epi64 (long long const *__X, __m128i __M )
+{
+  return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X,
+                                            (__v2di)__M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y )
+{
+  __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y )
+{
+  __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+  __v2df __zero = _mm_setzero_pd ();
+  __v2df __mask = _mm_cmpeq_pd (__zero, __zero);
+
+  return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (),
+                                               __base,
+                                               (__v4si)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index,
+                      __m128d __mask, const int __scale)
+{
+  return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               (__v2df)__mask,
+                                               __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+  __v4df __zero = _mm256_setzero_pd ();
+  __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ);
+
+  return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (),
+                                               __base,
+                                               (__v4si)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_pd (__m256d __src, double const *__base,
+                         __m128i __index, __m256d __mask, const int __scale)
+{
+  return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               (__v4df)__mask,
+                                               __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale)
+{
+  __v2df __src = _mm_setzero_pd ();
+  __v2df __mask = _mm_cmpeq_pd (__src, __src);
+
+  return (__m128d) __builtin_ia32_gatherdiv2df (__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_pd (__m128d __src, double const *__base, __m128i __index,
+                      __m128d __mask, const int __scale)
+{
+  return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               (__v2df)__mask,
+                                               __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale)
+{
+  __v4df __src = _mm256_setzero_pd ();
+  __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ);
+
+  return (__m256d) __builtin_ia32_gatherdiv4df (__src,
+                                               __base,
+                                               (__v4di)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_pd (__m256d __src, double const *__base,
+                         __m256i __index, __m256d __mask, const int __scale)
+{
+  return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src,
+                                               __base,
+                                               (__v4di)__index,
+                                               (__v4df)__mask,
+                                               __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale)
+{
+  __v4sf __src = _mm_setzero_ps ();
+  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+
+  return (__m128) __builtin_ia32_gathersiv4sf (__src,
+                                              __base,
+                                              (__v4si)__index,
+                                              __mask,
+                                              __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index,
+                      __m128 __mask, const int __scale)
+{
+  return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src,
+                                              __base,
+                                              (__v4si)__index,
+                                              (__v4sf)__mask,
+                                              __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale)
+{
+  __v8sf __src = _mm256_setzero_ps ();
+  __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ);
+
+  return (__m256) __builtin_ia32_gathersiv8sf (__src,
+                                              __base,
+                                              (__v8si)__index,
+                                              __mask,
+                                              __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_ps (__m256 __src, float const *__base,
+                         __m256i __index, __m256 __mask, const int __scale)
+{
+  return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src,
+                                              __base,
+                                              (__v8si)__index,
+                                              (__v8sf)__mask,
+                                              __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale)
+{
+  __v4sf __src = _mm_setzero_ps ();
+  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf (__src,
+                                              __base,
+                                              (__v2di)__index,
+                                              __mask,
+                                              __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index,
+                      __m128 __mask, const int __scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               (__v4sf)__mask,
+                                               __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale)
+{
+  __v4sf __src = _mm_setzero_ps ();
+  __v4sf __mask = _mm_cmpeq_ps (__src, __src);
+
+  return (__m128) __builtin_ia32_gatherdiv4sf256 (__src,
+                                                 __base,
+                                                 (__v4di)__index,
+                                                 __mask,
+                                                 __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_ps (__m128 __src, float const *__base,
+                         __m256i __index, __m128 __mask, const int __scale)
+{
+  return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src,
+                                                 __base,
+                                                 (__v4di)__index,
+                                                 (__v4sf)__mask,
+                                                 __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi64 (long long int const *__base,
+                    __m128i __index, const int __scale)
+{
+  __v2di __src = __extension__ (__v2di){ 0, 0 };
+  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv2di (__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base,
+                         __m128i __index, __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               (__v2di)__mask,
+                                               __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi64 (long long int const *__base,
+                       __m128i __index, const int __scale)
+{
+  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv4di (__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base,
+                            __m128i __index, __m256i __mask,
+                            const int __scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               (__v4di)__mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi64 (long long int const *__base,
+                    __m128i __index, const int __scale)
+{
+  __v2di __src = __extension__ (__v2di){ 0, 0 };
+  __v2di __mask = __extension__ (__v2di){ ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv2di (__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base,
+                         __m128i __index, __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               (__v2di)__mask,
+                                               __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi64 (long long int const *__base,
+                       __m256i __index, const int __scale)
+{
+  __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 };
+  __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gatherdiv4di (__src,
+                                               __base,
+                                               (__v4di)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base,
+                            __m256i __index, __m256i __mask,
+                            const int __scale)
+{
+  return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src,
+                                               __base,
+                                               (__v4di)__index,
+                                               (__v4di)__mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale)
+{
+  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gathersiv4si (__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index,
+                         __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src,
+                                               __base,
+                                               (__v4si)__index,
+                                               (__v4si)__mask,
+                                               __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale)
+{
+  __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 };
+  __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 };
+
+  return (__m256i) __builtin_ia32_gathersiv8si (__src,
+                                               __base,
+                                               (__v8si)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base,
+                            __m256i __index, __m256i __mask,
+                            const int __scale)
+{
+  return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src,
+                                               __base,
+                                               (__v8si)__index,
+                                               (__v8si)__mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale)
+{
+  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si (__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               __mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index,
+                         __m128i __mask, const int __scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src,
+                                               __base,
+                                               (__v2di)__index,
+                                               (__v4si)__mask,
+                                               __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale)
+{
+  __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 };
+  __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 };
+
+  return (__m128i) __builtin_ia32_gatherdiv4si256 (__src,
+                                                  __base,
+                                                  (__v4di)__index,
+                                                  __mask,
+                                                  __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base,
+                            __m256i __index, __m128i __mask,
+                            const int __scale)
+{
+  return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src,
+                                                  __base,
+                                                  (__v4di)__index,
+                                                  (__v4si)__mask,
+                                                  __scale);
+}
+#else /* __OPTIMIZE__ */
+#define _mm_i32gather_pd(BASE, INDEX, SCALE)                           \
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (),   \
+                                        (double const *) (BASE),       \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v2df)                       \
+                                        _mm_cmpeq_pd (_mm_setzero_pd (),\
+                                                      _mm_setzero_pd ()),\
+                                        (int) (SCALE))
+
+#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)           \
+  (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC),      \
+                                        (double const *) (BASE),       \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v2df)(__m128d) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm256_i32gather_pd(BASE, INDEX, SCALE)                                \
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (),        \
+                                        (double const *) (BASE),       \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v4df)                       \
+                                        _mm256_cmp_pd (_mm256_setzero_pd (),\
+                                                       _mm256_setzero_pd (),\
+                                                       _CMP_EQ_OQ),    \
+                                        (int) (SCALE))
+
+#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC),      \
+                                        (double const *) (BASE),       \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v4df)(__m256d) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm_i64gather_pd(BASE, INDEX, SCALE)                           \
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (),   \
+                                        (double const *) (BASE),       \
+                                        (__v2di)(__m128i) (INDEX),     \
+                                        (__v2df)                       \
+                                        _mm_cmpeq_pd (_mm_setzero_pd (),\
+                                                      _mm_setzero_pd ()),\
+                                        (int) (SCALE))
+
+#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)           \
+  (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC),      \
+                                        (double const *) (BASE),       \
+                                        (__v2di)(__m128i) (INDEX),     \
+                                        (__v2df)(__m128d) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm256_i64gather_pd(BASE, INDEX, SCALE)                                \
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (),        \
+                                        (double const *) (BASE),       \
+                                        (__v4di)(__m256i) (INDEX),     \
+                                        (__v4df)                       \
+                                        _mm256_cmp_pd (_mm256_setzero_pd (),\
+                                                       _mm256_setzero_pd (),\
+                                                       _CMP_EQ_OQ),    \
+                                        (int) (SCALE))
+
+#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC),      \
+                                        (double const *) (BASE),       \
+                                        (__v4di)(__m256i) (INDEX),     \
+                                        (__v4df)(__m256d) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm_i32gather_ps(BASE, INDEX, SCALE)                           \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (),    \
+                                       (float const *) (BASE),         \
+                                       (__v4si)(__m128i) (INDEX),      \
+                                       (__v4sf)                        \
+                                       _mm_cmpeq_ps (_mm_setzero_ps (),\
+                                                     _mm_setzero_ps ()),\
+                                       (int) (SCALE))
+
+#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)           \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC),                \
+                                       (float const *) (BASE),         \
+                                       (__v4si)(__m128i) (INDEX),      \
+                                       (__v4sf)(__m128) (MASK),        \
+                                       (int) (SCALE))
+
+#define _mm256_i32gather_ps(BASE, INDEX, SCALE)                                \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \
+                                       (float const *) (BASE),         \
+                                       (__v8si)(__m256i) (INDEX),      \
+                                       (__v8sf)                        \
+                                       _mm256_cmp_ps (_mm256_setzero_ps (),\
+                                                      _mm256_setzero_ps (),\
+                                                      _CMP_EQ_OQ),     \
+                                       (int) (SCALE))
+
+#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC),                \
+                                       (float const *) (BASE),         \
+                                       (__v8si)(__m256i) (INDEX),      \
+                                       (__v8sf)(__m256) (MASK),        \
+                                       (int) (SCALE))
+
+#define _mm_i64gather_ps(BASE, INDEX, SCALE)                           \
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_ps (),	\
+                                       (float const *) (BASE),         \
+                                       (__v2di)(__m128i) (INDEX),      \
+                                       (__v4sf)                        \
+                                       _mm_cmpeq_ps (_mm_setzero_ps (),\
+                                                     _mm_setzero_ps ()),\
+                                       (int) (SCALE))
+
+#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)           \
+  (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC),                \
+                                       (float const *) (BASE),         \
+                                       (__v2di)(__m128i) (INDEX),      \
+                                       (__v4sf)(__m128) (MASK),        \
+                                       (int) (SCALE))
+
+#define _mm256_i64gather_ps(BASE, INDEX, SCALE)                                \
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \
+                                          (float const *) (BASE),      \
+                                          (__v4di)(__m256i) (INDEX),   \
+                                          (__v4sf)                     \
+                                          _mm_cmpeq_ps (_mm_setzero_ps (),\
+                                                        _mm_setzero_ps ()),\
+                                          (int) (SCALE))
+
+#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC),     \
+                                          (float const *) (BASE),      \
+                                          (__v4di)(__m256i) (INDEX),   \
+                                          (__v4sf)(__m128) (MASK),     \
+                                          (int) (SCALE))
+
+#define _mm_i32gather_epi64(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \
+                                        (long long const *) (BASE),    \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v2di)_mm_set1_epi64x (-1),  \
+                                        (int) (SCALE))
+
+#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC),      \
+                                        (long long const *) (BASE),    \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v2di)(__m128i) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm256_i32gather_epi64(BASE, INDEX, SCALE)                        \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \
+                                        (long long const *) (BASE),       \
+                                        (__v4si)(__m128i) (INDEX),        \
+                                        (__v4di)_mm256_set1_epi64x (-1),  \
+                                        (int) (SCALE))
+
+#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE)     \
+  (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC),      \
+                                        (long long const *) (BASE),    \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v4di)(__m256i) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm_i64gather_epi64(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \
+                                        (long long const *) (BASE),    \
+                                        (__v2di)(__m128i) (INDEX),     \
+                                        (__v2di)_mm_set1_epi64x (-1),  \
+                                        (int) (SCALE))
+
+#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC),      \
+                                        (long long const *) (BASE),    \
+                                        (__v2di)(__m128i) (INDEX),     \
+                                        (__v2di)(__m128i) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm256_i64gather_epi64(BASE, INDEX, SCALE)                        \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \
+                                        (long long const *) (BASE),       \
+                                        (__v4di)(__m256i) (INDEX),        \
+                                        (__v4di)_mm256_set1_epi64x (-1),  \
+                                        (int) (SCALE))
+
+#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE)     \
+  (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC),      \
+                                        (long long const *) (BASE),    \
+                                        (__v4di)(__m256i) (INDEX),     \
+                                        (__v4di)(__m256i) (MASK),      \
+                                        (int) (SCALE))
+
+#define _mm_i32gather_epi32(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (),        \
+                                        (int const *) (BASE),          \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__v4si)_mm_set1_epi32 (-1),   \
+                                        (int) (SCALE))
+
+#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC),      \
+                                       (int const *) (BASE),           \
+                                       (__v4si)(__m128i) (INDEX),      \
+                                       (__v4si)(__m128i) (MASK),       \
+                                       (int) (SCALE))
+
+#define _mm256_i32gather_epi32(BASE, INDEX, SCALE)                        \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \
+                                        (int const *) (BASE),             \
+                                        (__v8si)(__m256i) (INDEX),        \
+                                        (__v8si)_mm256_set1_epi32 (-1),   \
+                                        (int) (SCALE))
+
+#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE)     \
+  (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC),      \
+                                       (int const *) (BASE),           \
+                                       (__v8si)(__m256i) (INDEX),      \
+                                       (__v8si)(__m256i) (MASK),       \
+                                       (int) (SCALE))
+
+#define _mm_i64gather_epi32(BASE, INDEX, SCALE)                                \
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (),        \
+                                        (int const *) (BASE),          \
+                                        (__v2di)(__m128i) (INDEX),     \
+                                        (__v4si)_mm_set1_epi32 (-1),   \
+                                        (int) (SCALE))
+
+#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)                \
+  (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC),      \
+                                       (int const *) (BASE),           \
+                                       (__v2di)(__m128i) (INDEX),      \
+                                       (__v4si)(__m128i) (MASK),       \
+                                       (int) (SCALE))
+
+#define _mm256_i64gather_epi32(BASE, INDEX, SCALE)                        \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \
+                                           (int const *) (BASE),          \
+                                           (__v4di)(__m256i) (INDEX),     \
+                                           (__v4si)_mm_set1_epi32(-1),    \
+                                           (int) (SCALE))
+
+#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE)     \
+  (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC),   \
+                                          (int const *) (BASE),        \
+                                          (__v4di)(__m256i) (INDEX),   \
+                                          (__v4si)(__m128i) (MASK),    \
+                                          (int) (SCALE))
+#endif  /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_AVX2__
+#undef __DISABLE_AVX2__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX2__ */
+
+#endif /* _AVX2INTRIN_H_INCLUDED */
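
Illustrative only, not part of the imported header: a minimal usage sketch of the AVX2 gather and variable-shift intrinsics declared above (the names table, idx, counts and out are ours), built with e.g. gcc -O2 -mavx2.

/* Gather every second int from a table, then apply a per-lane shift. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  int table[16];
  for (int i = 0; i < 16; i++)
    table[i] = i * 10;

  /* Gather lanes 0,2,...,14; the scale argument is in bytes, so 4 for int. */
  __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
  __m256i gathered = _mm256_i32gather_epi32(table, idx, 4);

  /* Per-lane variable shift: lane i is shifted left by i bits. */
  __m256i counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  __m256i shifted = _mm256_sllv_epi32(gathered, counts);

  int out[8];
  _mm256_storeu_si256((__m256i *)out, shifted);
  for (int i = 0; i < 8; i++)
    printf("%d ", out[i]);
  putchar('\n');
  return 0;
}
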
diff --git a/include-gcc/avx5124fmapsintrin.h b/include-gcc/avx5124fmapsintrin.h
new file mode 100644
index 0000000..97dd77c
--- /dev/null
+++ b/include-gcc/avx5124fmapsintrin.h
@@ -0,0 +1,216 @@
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx5124fmapsintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX5124FMAPSINTRIN_H_INCLUDED
+#define _AVX5124FMAPSINTRIN_H_INCLUDED
+
+#ifndef __AVX5124FMAPS__
+#pragma GCC push_options
+#pragma GCC target("avx5124fmaps")
+#define __DISABLE_AVX5124FMAPS__
+#endif /* __AVX5124FMAPS__ */
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4fmadd_ps (__m512 __A, __m512 __B, __m512 __C,
+                 __m512 __D, __m512 __E, __m128 *__F)
+{
+  return (__m512) __builtin_ia32_4fmaddps ((__v16sf) __B,
+                                          (__v16sf) __C,
+                                          (__v16sf) __D,
+                                          (__v16sf) __E,
+                                          (__v16sf) __A,
+                                          (const __v4sf *) __F);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                      __m512 __C, __m512 __D, __m512 __E, __m128 *__F)
+{
+  return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
+                                               (__v16sf) __C,
+                                               (__v16sf) __D,
+                                               (__v16sf) __E,
+                                               (__v16sf) __A,
+                                               (const __v4sf *) __F,
+                                               (__v16sf) __A,
+                                               (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4fmadd_ps (__mmask16 __U,
+                       __m512 __A, __m512 __B, __m512 __C,
+                       __m512 __D, __m512 __E, __m128 *__F)
+{
+  return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B,
+                                               (__v16sf) __C,
+                                               (__v16sf) __D,
+                                               (__v16sf) __E,
+                                               (__v16sf) __A,
+                                               (const __v4sf *) __F,
+                                               (__v16sf) _mm512_setzero_ps (),
+                                               (__mmask16) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_4fmadd_ss (__m128 __A, __m128 __B, __m128 __C,
+              __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fmaddss ((__v4sf) __B,
+                                          (__v4sf) __C,
+                                          (__v4sf) __D,
+                                          (__v4sf) __E,
+                                          (__v4sf) __A,
+                                          (const __v4sf *) __F);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_4fmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
+                   __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
+                                               (__v4sf) __C,
+                                               (__v4sf) __D,
+                                               (__v4sf) __E,
+                                               (__v4sf) __A,
+                                               (const __v4sf *) __F,
+                                               (__v4sf) __A,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_4fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
+                    __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B,
+                                               (__v4sf) __C,
+                                               (__v4sf) __D,
+                                               (__v4sf) __E,
+                                               (__v4sf) __A,
+                                               (const __v4sf *) __F,
+                                               (__v4sf) _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4fnmadd_ps (__m512 __A, __m512 __B, __m512 __C,
+                  __m512 __D, __m512 __E, __m128 *__F)
+{
+  return (__m512) __builtin_ia32_4fnmaddps ((__v16sf) __B,
+                                           (__v16sf) __C,
+                                           (__v16sf) __D,
+                                           (__v16sf) __E,
+                                           (__v16sf) __A,
+                                           (const __v4sf *) __F);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                       __m512 __C, __m512 __D, __m512 __E, __m128 *__F)
+{
+  return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
+                                                (__v16sf) __C,
+                                                (__v16sf) __D,
+                                                (__v16sf) __E,
+                                                (__v16sf) __A,
+                                                (const __v4sf *) __F,
+                                                (__v16sf) __A,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4fnmadd_ps (__mmask16 __U,
+                        __m512 __A, __m512 __B, __m512 __C,
+                        __m512 __D, __m512 __E, __m128 *__F)
+{
+  return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B,
+                                                (__v16sf) __C,
+                                                (__v16sf) __D,
+                                                (__v16sf) __E,
+                                                (__v16sf) __A,
+                                                (const __v4sf *) __F,
+                                                (__v16sf) _mm512_setzero_ps (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_4fnmadd_ss (__m128 __A, __m128 __B, __m128 __C,
+               __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fnmaddss ((__v4sf) __B,
+                                           (__v4sf) __C,
+                                           (__v4sf) __D,
+                                           (__v4sf) __E,
+                                           (__v4sf) __A,
+                                           (const __v4sf *) __F);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_4fnmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C,
+                    __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
+                                                (__v4sf) __C,
+                                                (__v4sf) __D,
+                                                (__v4sf) __E,
+                                                (__v4sf) __A,
+                                                (const __v4sf *) __F,
+                                                (__v4sf) __A,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_4fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C,
+                     __m128 __D, __m128 __E, __m128 *__F)
+{
+  return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B,
+                                                (__v4sf) __C,
+                                                (__v4sf) __D,
+                                                (__v4sf) __E,
+                                                (__v4sf) __A,
+                                                (const __v4sf *) __F,
+                                                (__v4sf) _mm_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+#ifdef __DISABLE_AVX5124FMAPS__
+#undef __DISABLE_AVX5124FMAPS__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX5124FMAPS__ */
+
+#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */
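
avx5124fmapsintrin.h wraps the Knights Mill AVX512_4FMAPS instructions, which chain four FMAs that share one accumulator and read their four single-precision multipliers through a single __m128 memory operand (the __m128 *__F parameter above). The fragment below is an illustrative sketch only, not part of the imported header; fused_dot4 and coeff are our own names, and it assumes a build with -mavx512f -mavx5124fmaps on hardware that implements the extension.

/* Illustrative only; roughly, acc[i] += b[i]*f[0] + c[i]*f[1]
 * + d[i]*f[2] + e[i]*f[3] for each of the 16 float lanes. */
#include <immintrin.h>

__m512 fused_dot4(__m512 acc, __m512 b, __m512 c, __m512 d, __m512 e,
                  const float coeff[4])
{
  /* The intrinsic takes the four scalars through a __m128 pointer. */
  __m128 f = _mm_loadu_ps(coeff);
  return _mm512_4fmadd_ps(acc, b, c, d, e, &f);
}
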
diff --git a/include-gcc/avx5124vnniwintrin.h b/include-gcc/avx5124vnniwintrin.h
new file mode 100644
index 0000000..fd12958
--- /dev/null
+++ b/include-gcc/avx5124vnniwintrin.h
@@ -0,0 +1,132 @@
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx5124vnniwintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX5124VNNIWINTRIN_H_INCLUDED
+#define _AVX5124VNNIWINTRIN_H_INCLUDED
+
+#ifndef __AVX5124VNNIW__
+#pragma GCC push_options
+#pragma GCC target("avx5124vnniw")
+#define __DISABLE_AVX5124VNNIW__
+#endif /* __AVX5124VNNIW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C,
+                     __m512i __D, __m512i __E, __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssd ((__v16si) __B,
+                                            (__v16si) __C,
+                                            (__v16si) __D,
+                                            (__v16si) __E,
+                                            (__v16si) __A,
+                                            (const __v4si *) __F);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssd_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+                          __m512i __C, __m512i __D, __m512i __E,
+                          __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+                                                 (__v16si) __C,
+                                                 (__v16si) __D,
+                                                 (__v16si) __E,
+                                                 (__v16si) __A,
+                                                 (const __v4si *) __F,
+                                                 (__v16si) __A,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssd_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+                           __m512i __C, __m512i __D, __m512i __E,
+                           __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B,
+                                                 (__v16si) __C,
+                                                 (__v16si) __D,
+                                                 (__v16si) __E,
+                                                 (__v16si) __A,
+                                                 (const __v4si *) __F,
+                                                 (__v16si) _mm512_setzero_ps (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_4dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C,
+                      __m512i __D, __m512i __E, __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssds ((__v16si) __B,
+                                             (__v16si) __C,
+                                             (__v16si) __D,
+                                             (__v16si) __E,
+                                             (__v16si) __A,
+                                             (const __v4si *) __F);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_4dpwssds_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+                           __m512i __C, __m512i __D, __m512i __E,
+                           __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+                                                  (__v16si) __C,
+                                                  (__v16si) __D,
+                                                  (__v16si) __E,
+                                                  (__v16si) __A,
+                                                  (const __v4si *) __F,
+                                                  (__v16si) __A,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_4dpwssds_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+                            __m512i __C, __m512i __D, __m512i __E,
+                            __m128i *__F)
+{
+  return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B,
+                                                  (__v16si) __C,
+                                                  (__v16si) __D,
+                                                  (__v16si) __E,
+                                                  (__v16si) __A,
+                                                  (const __v4si *) __F,
+                                                  (__v16si) _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+#ifdef __DISABLE_AVX5124VNNIW__
+#undef __DISABLE_AVX5124VNNIW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX5124VNNIW__ */
+
+#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */
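
For reference, a minimal usage sketch of the AVX512_4VNNIW intrinsics imported above (illustrative only, not part of the header; the helper name is ours and the translation unit is assumed to be built with 4VNNIW support, e.g. -mavx5124vnniw):

#include <immintrin.h>

/* Accumulate four sequential 16-bit dot products into the 32-bit
   lanes of acc, using the 128-bit operand loaded from *weights.  */
__m512i
sum_4dpwssd (__m512i acc, __m512i b0, __m512i b1, __m512i b2,
             __m512i b3, __m128i *weights)
{
  return _mm512_4dpwssd_epi32 (acc, b0, b1, b2, b3, weights);
}
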
diff --git a/include-gcc/avx512bf16intrin.h b/include-gcc/avx512bf16intrin.h
new file mode 100644
index 0000000..107f4a4
--- /dev/null
+++ b/include-gcc/avx512bf16intrin.h
@@ -0,0 +1,152 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BF16INTRIN_H_INCLUDED
+#define _AVX512BF16INTRIN_H_INCLUDED
+
+#ifndef __AVX512BF16__
+#pragma GCC push_options
+#pragma GCC target("avx512bf16")
+#define __DISABLE_AVX512BF16__
+#endif /* __AVX512BF16__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* Convert One BF16 Data to One Single Float Data.  */
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsbh_ss (__bf16 __A)
+{
+  return __builtin_ia32_cvtbf2sf (__A);
+}
+
+/* vcvtne2ps2bf16 */
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B);
+}
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m512bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C)
+{
+  return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A);
+}
+
+/* vcvtneps2bf16 */
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtneps_pbh (__m512 __A)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B)
+{
+  return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A);
+}
+
+/* vdpbf16ps */
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m512
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D)
+{
+  return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpbh_ps (__m256bh __A)
+{
+  return (__m512)_mm512_castsi512_ps ((__m512i)_mm512_slli_epi32 (
+        (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16));
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpbh_ps (__mmask16 __U, __m256bh __A)
+{
+  return (__m512)_mm512_castsi512_ps ((__m512i) _mm512_slli_epi32 (
+        (__m512i)_mm512_maskz_cvtepi16_epi32 (
+        (__mmask16)__U, (__m256i)__A), 16));
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A)
+{
+  return (__m512)_mm512_castsi512_ps ((__m512i)(_mm512_mask_slli_epi32 (
+        (__m512i)__S, (__mmask16)__U,
+        (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)));
+}
+
+#ifdef __DISABLE_AVX512BF16__
+#undef __DISABLE_AVX512BF16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BF16__ */
+
+#endif /* _AVX512BF16INTRIN_H_INCLUDED */
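
A minimal usage sketch of the AVX512-BF16 intrinsics imported above (illustrative only, not part of the header; the helper name is hypothetical and avx512bf16 is assumed to be enabled at build time):

#include <immintrin.h>

/* Pack two float vectors into 32 bf16 elements each, then accumulate
   pairwise bf16 products into the 16 float lanes of acc.  */
__m512
bf16_dot_step (__m512 acc, __m512 a_lo, __m512 a_hi, __m512 b_lo, __m512 b_hi)
{
  __m512bh a = _mm512_cvtne2ps_pbh (a_hi, a_lo);
  __m512bh b = _mm512_cvtne2ps_pbh (b_hi, b_lo);
  return _mm512_dpbf16_ps (acc, a, b);
}
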
diff --git a/include-gcc/avx512bf16vlintrin.h b/include-gcc/avx512bf16vlintrin.h
new file mode 100644
index 0000000..6e8a6a0
--- /dev/null
+++ b/include-gcc/avx512bf16vlintrin.h
@@ -0,0 +1,238 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bf16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BF16VLINTRIN_H_INCLUDED
+#define _AVX512BF16VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BF16__)
+#pragma GCC push_options
+#pragma GCC target("avx512bf16,avx512vl")
+#define __DISABLE_AVX512BF16VL__
+#endif /* __AVX512BF16__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32)));
+typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__));
+
+typedef __bf16 __bfloat16;
+
+#define _mm256_cvtneps_pbh(A) \
+  (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A)
+#define _mm_cvtneps_pbh(A) \
+  (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (A)
+
+/* vcvtne2ps2bf16 */
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m256bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C)
+{
+  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtne2ps_pbh (__m128 __A, __m128 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A);
+}
+
+/* vcvtneps2bf16 */
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B);
+}
+
+extern __inline __m128bh
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B)
+{
+  return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A);
+}
+
+/* vdpbf16ps */
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D)
+{
+  return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D)
+{
+  return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A);
+}
+
+extern __inline __bf16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtness_sbh (float __A)
+{
+  __v4sf __V = {__A, 0, 0, 0};
+  __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V,
+              (__v8bf)_mm_undefined_si128 (), (__mmask8)-1);
+  return __R[0];
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpbh_ps (__m128bh __A)
+{
+  return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
+        (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16));
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpbh_ps (__m128bh __A)
+{
+  return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
+        (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16));
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
+{
+  return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 (
+        (__m128i)_mm_maskz_cvtepi16_epi32 (
+        (__mmask8)__U, (__m128i)__A), 16));
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A)
+{
+  return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 (
+        (__m256i)_mm256_maskz_cvtepi16_epi32 (
+        (__mmask8)__U, (__m128i)__A), 16));
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A)
+{
+  return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 (
+        (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 (
+        (__m128i)__A), 16));
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A)
+{
+  return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 (
+        (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 (
+        (__m128i)__A), 16));
+}
+
+#ifdef __DISABLE_AVX512BF16VL__
+#undef __DISABLE_AVX512BF16VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BF16VL__ */
+
+#endif /* _AVX512BF16VLINTRIN_H_INCLUDED */
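
A short sketch of the 128-bit VL forms imported above (illustrative only, not part of the header; the helper name is hypothetical and avx512bf16 plus avx512vl are assumed to be enabled):

#include <immintrin.h>

/* Round one float to bf16 and widen it back, discarding the low
   16 mantissa bits in the process.  */
float
bf16_roundtrip (float x)
{
  __bf16 b = _mm_cvtness_sbh (x);
  return _mm_cvtsbh_ss (b);
}
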
diff --git a/include-gcc/avx512bitalgintrin.h b/include-gcc/avx512bitalgintrin.h
new file mode 100644
index 0000000..aa6d652
--- /dev/null
+++ b/include-gcc/avx512bitalgintrin.h
@@ -0,0 +1,283 @@
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512bitalgintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX512BITALGINTRIN_H_INCLUDED
+#define _AVX512BITALGINTRIN_H_INCLUDED
+
+#ifndef __AVX512BITALG__
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg")
+#define __DISABLE_AVX512BITALG__
+#endif /* __AVX512BITALG__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountb_v64qi ((__v64qi) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountw_v32hi ((__v32hi) __A);
+}
+
+#ifdef __DISABLE_AVX512BITALG__
+#undef __DISABLE_AVX512BITALG__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALG__ */
+
+#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg,avx512bw")
+#define __DISABLE_AVX512BITALGBW__
+#endif /* __AVX512BITALG__ && __AVX512BW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
+                                                        (__v64qi) __W,
+                                                        (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A,
+                                               (__v64qi)
+                                               _mm512_setzero_si512 (),
+                                               (__mmask64) __U);
+}
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
+                                                       (__v32hi) __W,
+                                                       (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A,
+                                               (__v32hi)
+                                               _mm512_setzero_si512 (),
+                                               (__mmask32) __U);
+}
+
+extern __inline __mmask64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bitshuffle_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__mmask64) __M);
+}
+
+#ifdef __DISABLE_AVX512BITALGBW__
+#undef __DISABLE_AVX512BITALGBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALGBW__ */
+
+#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg,avx512vl,avx512bw")
+#define __DISABLE_AVX512BITALGVLBW__
+#endif /* __AVX512BITALG__ && __AVX512VL__ && __AVX512BW__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
+                                                        (__v32qi) __W,
+                                                        (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A,
+                                               (__v32qi)
+                                                _mm256_setzero_si256 (),
+                                               (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bitshuffle_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__mmask32) __M);
+}
+
+#ifdef __DISABLE_AVX512BITALGVLBW__
+#undef __DISABLE_AVX512BITALGVLBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALGVLBW__ */
+
+
+#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512bitalg,avx512vl")
+#define __DISABLE_AVX512BITALGVL__
+#endif /* __AVX512BITALG__ && __AVX512VL__ */
+
+extern __inline __mmask16
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_bitshuffle_epi64_mask (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi8 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountb_v32qi ((__v32qi) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi16 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountw_v16hi ((__v16hi) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountb_v16qi ((__v16qi) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountw_v8hi ((__v8hi) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
+                                                       (__v16hi) __W,
+                                                       (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A,
+                                               (__v16hi)
+                                               _mm256_setzero_si256 (),
+                                               (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
+                                                        (__v16qi) __W,
+                                                        (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A,
+                                                        (__v16qi)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask16) __U);
+}
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
+                                                       (__v8hi) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A,
+                                                       (__v8hi)
+                                                       _mm_setzero_si128 (),
+                                                       (__mmask8) __U);
+}
+#ifdef __DISABLE_AVX512BITALGVL__
+#undef __DISABLE_AVX512BITALGVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BITALGVL__ */
+
+#endif /* _AVX512BITALGINTRIN_H_INCLUDED */
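
A short sketch of the BITALG population-count intrinsics imported above (illustrative only, not part of the header; helper names are hypothetical and avx512bitalg plus avx512bw are assumed to be enabled):

#include <immintrin.h>

/* Per-byte population count over all 64 bytes of v.  */
__m512i
bytewise_popcount (__m512i v)
{
  return _mm512_popcnt_epi8 (v);
}

/* Masked variant: byte lanes not selected by keep are taken from src.  */
__m512i
bytewise_popcount_masked (__m512i src, __mmask64 keep, __m512i v)
{
  return _mm512_mask_popcnt_epi8 (src, keep, v);
}
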
diff --git a/include-gcc/avx512bwintrin.h b/include-gcc/avx512bwintrin.h
new file mode 100644
index 0000000..89790f7
--- /dev/null
+++ b/include-gcc/avx512bwintrin.h
@@ -0,0 +1,3333 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512bwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512BWINTRIN_H_INCLUDED
+#define _AVX512BWINTRIN_H_INCLUDED
+
+#ifndef __AVX512BW__
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+#define __DISABLE_AVX512BW__
+#endif /* __AVX512BW__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi_u __attribute__ ((__vector_size__ (64),  \
+                                       __may_alias__, __aligned__ (1)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef char __v64qi_u __attribute__ ((__vector_size__ (64),   \
+                                      __may_alias__, __aligned__ (1)));
+
+typedef unsigned long long __mmask64;
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestcsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestcdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask32_u8  (__mmask32 __A,  __mmask32 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask64_u8  (__mmask64 __A,  __mmask64 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestzsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestzdi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestcsi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestcdi (__A, __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask32 (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kaddsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask64 (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask32_u32 (__mmask32 __A)
+{
+  return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask64_u64 (__mmask64 __A)
+{
+  return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask32 (unsigned int __A)
+{
+  return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu64_mask64 (unsigned long long __A)
+{
+  return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask32 (__mmask32 *__A)
+{
+  return (__mmask32) __builtin_ia32_kmovd (*__A);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask64 (__mmask64 *__A)
+{
+  return (__mmask64) __builtin_ia32_kmovq (*(__mmask64 *) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask32 (__mmask32 *__A, __mmask32 __B)
+{
+  *(__mmask32 *) __A = __builtin_ia32_kmovd (__B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask64 (__mmask64 *__A, __mmask64 __B)
+{
+  *(__mmask64 *) __A = __builtin_ia32_kmovq (__B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask32 (__mmask32 __A)
+{
+  return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask64 (__mmask64 __A)
+{
+  return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask32 (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask64 (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask32 (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask64 (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask32 (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask64 (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask32 (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask64 (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask32 (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask64 (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A,
+                                                   (__v32hi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi16 (void const *__P)
+{
+  return (__m512i) (*(__v32hi_u *) __P);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P,
+                                                    (__v32hi) __W,
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi16 (void *__P, __m512i __A)
+{
+  *(__v32hi_u *) __P = (__v32hi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A)
+{
+  __builtin_ia32_storedquhi512_mask ((short *) __P,
+                                    (__v32hi) __A,
+                                    (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A,
+                                                   (__v64qi) __W,
+                                                   (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A,
+                                                   (__v64qi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask64) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kunpackw (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+                                             (__mmask32) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kunpackw_mask32 (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A,
+                                             (__mmask32) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kunpackd (__mmask64 __A, __mmask64 __B)
+{
+  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+                                             (__mmask64) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kunpackd_mask64 (__mmask32 __A, __mmask32 __B)
+{
+  return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A,
+                                             (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi8 (void const *__P)
+{
+  return (__m512i) (*(__v64qi_u *) __P);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P,
+                                                    (__v64qi) __W,
+                                                    (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P,
+                                                    (__v64qi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask64) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi8 (void *__P, __m512i __A)
+{
+  *(__v64qi_u *) __P = (__v64qi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A)
+{
+  __builtin_ia32_storedquqi512_mask ((char *) __P,
+                                    (__v64qi) __A,
+                                    (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sad_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A,
+                                            (__v64qi) __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi8 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+                                                 (__v32qi) _mm256_undefined_si256(),
+                                                 (__mmask32) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+                                                 (__v32qi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi16_epi8 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+                                                  (__v32qi)_mm256_undefined_si256(),
+                                                  (__mmask32) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+                                                  (__v32qi)__O,
+                                                  __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A,
+                                                  (__v32qi)
+                                                  _mm256_setzero_si256 (),
+                                                  __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi16_epi8 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                                                   (__v32qi)_mm256_undefined_si256(),
+                                                   (__mmask32) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                                                   (__v32qi) __O,
+                                                   __M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A)
+{
+  __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A,
+                                                   (__v32qi)
+                                                   _mm256_setzero_si256 (),
+                                                   __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastb_epi8 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+                                                      (__v64qi)_mm512_undefined_epi32(),
+                                                      (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+                                                      (__v64qi) __O,
+                                                      __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A,
+                                                      (__v64qi)
+                                                      _mm512_setzero_si512 (),
+                                                      __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                                                          (__v64qi) __O,
+                                                          __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi8 (__mmask64 __M, char __A)
+{
+  return (__m512i)
+        __builtin_ia32_pbroadcastb512_gpr_mask (__A,
+                                                (__v64qi)
+                                                _mm512_setzero_si512 (),
+                                                __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastw_epi16 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+                                                      (__v32hi)_mm512_undefined_epi32(),
+                                                      (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+                                                      (__v32hi) __O,
+                                                      __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A,
+                                                      (__v32hi)
+                                                      _mm512_setzero_si512 (),
+                                                      __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                                                          (__v32hi) __O,
+                                                          __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi16 (__mmask32 __M, short __A)
+{
+  return (__m512i)
+        __builtin_ia32_pbroadcastw512_gpr_mask (__A,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mulhrs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                         __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v32hi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mulhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mulhi_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+                        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v32hu) __A * (__v32hu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
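A minimal usage sketch for the mulhi/mullo family above (assuming <immintrin.h> and a -mavx512bw build; the helper name is illustrative, not part of this header):

#include <immintrin.h>

/* Full 32-bit products of signed 16-bit lanes, assembled from the low
   halves (_mm512_mullo_epi16) and the high halves (_mm512_mulhi_epi16).
   The mask_/maskz_ variants above follow the usual AVX-512 convention:
   lanes with a clear mask bit keep __W or are zeroed, respectively. */
static inline void mul16_full(__m512i a, __m512i b, __m512i *lo, __m512i *hi)
{
  *lo = _mm512_mullo_epi16(a, b);   /* low 16 bits of each product */
  *hi = _mm512_mulhi_epi16(a, b);   /* high 16 bits of each product */
}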
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi16 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+                                                   (__v32hi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi16 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+                                                   (__v32hi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) __U);
+}
+
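A sketch of the widening conversions above, assuming a -mavx512bw build; the helper name and mask constant are made up for illustration:

#include <immintrin.h>

/* Sign-extend 32 bytes to 16-bit lanes, keeping only the lanes selected
   by the mask (the maskz_ form zeroes the rest). */
static inline __m512i widen_low_half(__m256i bytes)
{
  return _mm512_maskz_cvtepi8_epi16((__mmask32) 0x0000FFFFu, bytes);
}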
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                                                    (__v32hi) __A,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A,
+                               __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                                                    (__v32hi) __A,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+                              __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                                                    (__v32hi) __A,
+                                                    (__v32hi) __W,
+                                                    (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I
+                                                       /* idx */ ,
+                                                       (__v32hi) __A,
+                                                       (__v32hi) __B,
+                                                       (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U,
+                               __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I
+                                                       /* idx */ ,
+                                                       (__v32hi) __A,
+                                                       (__v32hi) __B,
+                                                       (__mmask32)
+                                                       __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I,
+                                __mmask32 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+                                                       (__v32hi) __I
+                                                       /* idx */ ,
+                                                       (__v32hi) __B,
+                                                       (__mmask32)
+                                                       __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A,
+                                __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I
+                                                        /* idx */ ,
+                                                        (__v32hi) __A,
+                                                        (__v32hi) __B,
+                                                        (__mmask32)
+                                                        __U);
+}
+
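An illustrative use of the variable permute above (helper name hypothetical; -mavx512bw assumed): reversing the 32 word lanes of a vector.

#include <immintrin.h>

/* idx[i] selects the source lane written to result lane i, so an index
   vector counting 31 down to 0 reverses the lanes. */
static inline __m512i reverse_epi16(__m512i v)
{
  const __m512i idx = _mm512_set_epi16( 0,  1,  2,  3,  4,  5,  6,  7,
                                        8,  9, 10, 11, 12, 13, 14, 15,
                                       16, 17, 18, 19, 20, 21, 22, 23,
                                       24, 25, 26, 27, 28, 29, 30, 31);
  return _mm512_permutexvar_epi16(idx, v);
}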
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_avg_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__v64qi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+                     __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__v64qi) __W,
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__v64qi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v64qu) __A + (__v64qu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                     __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__v64qi) __W,
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__v64qi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v64qu) __A - (__v64qu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                     __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__v64qi) __W,
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__v64qi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_avg_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__v32hi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi) __W,
+                                                 (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+                                                  (__v64qi) __B,
+                                                  (__v64qi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+                                                  (__v64qi) __B,
+                                                  (__v64qi) __W,
+                                                  (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A,
+                                                  (__v64qi) __B,
+                                                  (__v64qi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi) __W,
+                                                 (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+                                                  (__v64qi) __B,
+                                                  (__v64qi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+                                                  (__v64qi) __B,
+                                                  (__v64qi) __W,
+                                                  (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A,
+                                                  (__v64qi) __B,
+                                                  (__v64qi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v32hu) __A - (__v32hu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__v32hi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_subs_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v32hu) __A + (__v32hu) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__v32hi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_adds_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) __U);
+}
+
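A sketch of the saturating forms above (illustrative helper; -mavx512bw assumed): adds/subs clamp to the element range instead of wrapping.

#include <immintrin.h>

/* Brighten 64 unsigned bytes at once; 200 + 100 saturates to 255 rather
   than wrapping around to 44. */
static inline __m512i brighten_u8(__m512i pixels, unsigned char delta)
{
  return _mm512_adds_epu8(pixels, _mm512_set1_epi8((char) delta));
}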
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi16 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                      __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packs_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v64qi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi16 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                      __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maddubs_epi16 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                                                    (__v64qi) __Y,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X,
+                          __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                                                    (__v64qi) __Y,
+                                                    (__v32hi) __W,
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X,
+                                                    (__v64qi) __Y,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_madd_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v16si)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v16si) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A,
+                                                  (__v32hi) __B,
+                                                  (__v16si)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask16) __U);
+}
+
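A sketch of the multiply-add forms above (helper name hypothetical; -mavx512bw assumed): _mm512_madd_epi16 multiplies adjacent 16-bit lanes and sums each pair into a 32-bit lane, a common integer dot-product step.

#include <immintrin.h>

/* One accumulation step of a widening 16-bit dot product. */
static inline __m512i dot_step(__m512i acc, __m512i a, __m512i b)
{
  return _mm512_add_epi32(acc, _mm512_madd_epi16(a, b));
}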
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__v64qi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__v64qi) __W,
+                                                    (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__v64qi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__v32hi) __W,
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__v64qi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                          __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__v64qi) __W,
+                                                    (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__v64qi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__v32hi) __W,
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__v32hi)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask32) __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+                                                   (__v64qi) __B, 0,
+                                                   (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+                                                   (__v64qi) __B, 0,
+                                                   __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B, 0,
+                                                   (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B, 0,
+                                                   __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+                                                   (__v64qi) __B, 6,
+                                                   (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A,
+                                                   (__v64qi) __B, 6,
+                                                   __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A,
+                                                    (__v64qi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B, 6,
+                                                   (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B, 6,
+                                                   __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A,
+                                                    (__v32hi) __B,
+                                                    __U);
+}
+
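The comparisons above return a bit-mask rather than a vector; a sketch of feeding such a mask back into a masked operation from this header (helper and constants illustrative; -mavx512bw assumed):

#include <immintrin.h>

/* Add 'delta' only to the signed 16-bit lanes that exceed 'limit';
   lanes with a clear mask bit keep their value from __W (here v itself). */
static inline __m512i add_where_greater(__m512i v, short limit, short delta)
{
  __mmask32 over = _mm512_cmpgt_epi16_mask(v, _mm512_set1_epi16(limit));
  return _mm512_mask_add_epi16(v, over, v, _mm512_set1_epi16(delta));
}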
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi8_mask (__m512i __A)
+{
+  return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi16_mask (__m512i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi8 (__mmask64 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2b512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi16 (__mmask32 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2w512 (__A);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+                                               (__v64qi) __B,
+                                               (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A,
+                                               (__v64qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+                                               (__v32hi) __B,
+                                               (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A,
+                                               (__v32hi) __B, __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi8_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+                                                (__v64qi) __B,
+                                                (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A,
+                                                (__v64qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi16_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+                                                (__v32hi) __B,
+                                                (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A,
+                                                (__v32hi) __B, __U);
+}
+
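A sketch of the test intrinsics above (helper illustrative; __builtin_popcountll is the usual GCC builtin): a mask bit is set where the per-byte AND is non-zero, so popcounting the mask counts matching bytes.

#include <immintrin.h>

/* Count how many of the 64 bytes have any of the given flag bits set. */
static inline int count_flagged(__m512i bytes, __m512i flags)
{
  return __builtin_popcountll((unsigned long long)
                              _mm512_test_epi8_mask(bytes, flags));
}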
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                         __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi) __W,
+                                                 (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) __U);
+}
+
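An illustrative use of the byte shuffle above (helper name made up; -mavx512bw assumed): indices act within each 128-bit lane, so a 3,2,1,0 pattern byte-swaps every 32-bit element.

#include <immintrin.h>

/* Reverse the byte order of each 32-bit element (endianness swap). */
static inline __m512i bswap32_lanes(__m512i v)
{
  const __m512i idx = _mm512_set_epi8(12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3,
                                      12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3,
                                      12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3,
                                      12, 13, 14, 15,  8,  9, 10, 11,
                                       4,  5,  6,  7,  0,  1,  2,  3);
  return _mm512_shuffle_epi8(v, idx);
}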
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+                     __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi) __W,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+                     __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi) __W,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A,
+                     __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi) __W,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+                     __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A,
+                                                 (__v64qi) __B,
+                                                 (__v64qi) __W,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A,
+                      __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi16 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                      __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A,
+                                                (__v8hi) __B,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A,
+                                                 (__v32hi) __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+                        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v64qi) __W,
+                                                   (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v64qi)
+                                                   _mm512_setzero_si512 (),
+                                                   __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packus_epi16 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v64qi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A,
+                         __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v64qi) __W,
+                                                   (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A,
+                                                   (__v32hi) __B,
+                                                   (__v64qi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi8 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+                                                (__v64qi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+                                                (__v64qi) __W,
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A,
+                                                (__v64qi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi16 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+                                                (__v32hi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A,
+                                                (__v32hi)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 4,
+                                                  (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 1,
+                                                  (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 5,
+                                                  (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 2,
+                                                  (__mmask64) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 4,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 1,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 5,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 2,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 4,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 1,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 5,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 2,
+                                                 (__mmask64) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 4,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 1,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 5,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 2,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 4,
+                                                  (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 1,
+                                                  (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 5,
+                                                  (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, 2,
+                                                  (__mmask64) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 4,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 1,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 5,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, 2,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 4,
+                                                 (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 1,
+                                                 (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 5,
+                                                 (__mmask64) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi8_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, 2,
+                                                 (__mmask64) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 4,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 1,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 5,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi16_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, 2,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packs_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+                                                   (__v16si) __B,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+                                                   (__v16si) __B,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+                        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A,
+                                                   (__v16si) __B,
+                                                   (__v32hi) __W,
+                                                   __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_packus_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+                                                   (__v16si) __B,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+                                                   (__v16si) __B,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A,
+                         __m512i __B)
+{
+  return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A,
+                                                   (__v16si) __B,
+                                                   (__v32hi) __W,
+                                                   __M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask32 (__mmask32 __A, unsigned int __B)
+{
+  return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A,
+                                               (__mmask8) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask64 (__mmask64 __A, unsigned int __B)
+{
+  return (__mmask64) __builtin_ia32_kshiftlidi ((__mmask64) __A,
+                                               (__mmask8) __B);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask32 (__mmask32 __A, unsigned int __B)
+{
+  return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A,
+                                               (__mmask8) __B);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask64 (__mmask64 __A, unsigned int __B)
+{
+  return (__mmask64) __builtin_ia32_kshiftridi ((__mmask64) __A,
+                                               (__mmask8) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi8 (__m512i __A, __m512i __B, const int __N)
+{
+  return (__m512i) __builtin_ia32_palignr512 ((__v8di) __A,
+                                             (__v8di) __B, __N * 8);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi8 (__m512i __W, __mmask64 __U, __m512i __A,
+                        __m512i __B, const int __N)
+{
+  return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A,
+                                                  (__v8di) __B,
+                                                  __N * 8,
+                                                  (__v8di) __W,
+                                                  (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi8 (__mmask64 __U, __m512i __A, __m512i __B,
+                         const int __N)
+{
+  return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A,
+                                                  (__v8di) __B,
+                                                  __N * 8,
+                                                  (__v8di)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask64) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dbsad_epu8 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
+                                                   (__v64qi) __B,
+                                                   __imm,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dbsad_epu8 (__m512i __W, __mmask32 __U, __m512i __A,
+                       __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
+                                                   (__v64qi) __B,
+                                                   __imm,
+                                                   (__v32hi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B,
+                        const int __imm)
+{
+  return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A,
+                                                   (__v64qi) __B,
+                                                   __imm,
+                                                   (__v32hi)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi16 (__m512i __A, const int __imm)
+{
+  return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       const int __imm)
+{
+  return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi16 (__mmask32 __U, __m512i __A, const int __imm)
+{
+  return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi16 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       const int __B)
+{
+  return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shufflehi_epi16 (__m512i __A, const int __imm)
+{
+  return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
+                                                  __imm,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shufflehi_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                            const int __imm)
+{
+  return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
+                                                  __imm,
+                                                  (__v32hi) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shufflehi_epi16 (__mmask32 __U, __m512i __A,
+                             const int __imm)
+{
+  return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A,
+                                                  __imm,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shufflelo_epi16 (__m512i __A, const int __imm)
+{
+  return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
+                                                  __imm,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shufflelo_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                            const int __imm)
+{
+  return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
+                                                  __imm,
+                                                  (__v32hi) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shufflelo_epi16 (__mmask32 __U, __m512i __A,
+                             const int __imm)
+{
+  return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A,
+                                                  __imm,
+                                                  (__v32hi)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi16 (__m512i __A, const int __imm)
+{
+  return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A,
+                       const int __imm)
+{
+  return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
+                                                 (__v32hi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi16 (__mmask32 __U, __m512i __A, const int __imm)
+{
+  return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm,
+                                                 (__v32hi)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A,
+                                                   (__v32hi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A,
+                                                   (__v64qi) __W,
+                                                   (__mmask64) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi16_mask (__mmask32 __U, __m512i __X, __m512i __Y,
+                           const int __P)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, __P,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi16_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X,
+                                                 (__v32hi) __Y, __P,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
+                          const int __P)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, __P,
+                                                 (__mmask64) __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi8_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X,
+                                                 (__v64qi) __Y, __P,
+                                                 (__mmask64) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu16_mask (__mmask32 __U, __m512i __X, __m512i __Y,
+                           const int __P)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, __P,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu16_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X,
+                                                  (__v32hi) __Y, __P,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y,
+                          const int __P)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, __P,
+                                                  (__mmask64) __U);
+}
+
+extern __inline __mmask64
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu8_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X,
+                                                  (__v64qi) __Y, __P,
+                                                  (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bslli_epi128 (__m512i __A, const int __N)
+{
+  return (__m512i) __builtin_ia32_pslldq512 (__A, __N * 8);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_bsrli_epi128 (__m512i __A, const int __N)
+{
+  return (__m512i) __builtin_ia32_psrldq512 (__A, __N * 8);
+}
+
+#else
+#define _kshiftli_mask32(X, Y)                                                 \
+  ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y)))
+
+#define _kshiftli_mask64(X, Y)                                                 \
+  ((__mmask64) __builtin_ia32_kshiftlidi ((__mmask64)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask32(X, Y)                                                 \
+  ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask64(X, Y)                                                 \
+  ((__mmask64) __builtin_ia32_kshiftridi ((__mmask64)(X), (__mmask8)(Y)))
+
+#define _mm512_alignr_epi8(X, Y, N)                                                \
+  ((__m512i) __builtin_ia32_palignr512 ((__v8di)(__m512i)(X),                      \
+                                       (__v8di)(__m512i)(Y),                       \
+                                       (int)((N) * 8)))
+
+#define _mm512_mask_alignr_epi8(W, U, X, Y, N)                                     \
+  ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X),                 \
+                                           (__v8di)(__m512i)(Y), (int)((N) * 8),   \
+                                           (__v8di)(__m512i)(W), (__mmask64)(U)))
+
+#define _mm512_maskz_alignr_epi8(U, X, Y, N)                                       \
+  ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X),                 \
+                                            (__v8di)(__m512i)(Y), (int)((N) * 8),  \
+                                            (__v8di)(__m512i)                      \
+                                            _mm512_setzero_si512 (),               \
+                                            (__mmask64)(U)))
+
+#define _mm512_dbsad_epu8(X, Y, C)                                                  \
+  ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X),               \
+                                              (__v64qi)(__m512i) (Y), (int) (C),    \
+                                              (__v32hi)(__m512i)                   \
+                                             _mm512_setzero_si512 (),              \
+                                              (__mmask32)-1))
+
+#define _mm512_mask_dbsad_epu8(W, U, X, Y, C)                                       \
+  ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X),               \
+                                              (__v64qi)(__m512i) (Y), (int) (C),    \
+                                              (__v32hi)(__m512i)(W),                \
+                                              (__mmask32)(U)))
+
+#define _mm512_maskz_dbsad_epu8(U, X, Y, C)                                         \
+  ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X),               \
+                                              (__v64qi)(__m512i) (Y), (int) (C),    \
+                                              (__v32hi)(__m512i)                   \
+                                             _mm512_setzero_si512 (),              \
+                                              (__mmask32)(U)))
+
+#define _mm512_srli_epi16(A, B)                                         \
+  ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A),      \
+    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
+
+#define _mm512_mask_srli_epi16(W, U, A, B)                              \
+  ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A),      \
+    (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
+
+#define _mm512_maskz_srli_epi16(U, A, B)                                \
+  ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A),      \
+    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
+
+#define _mm512_slli_epi16(X, C)                                                   \
+  ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
+    (__v32hi)(__m512i)_mm512_setzero_si512 (),                            \
+    (__mmask32)-1))
+
+#define _mm512_mask_slli_epi16(W, U, X, C)                                 \
+  ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
+    (__v32hi)(__m512i)(W),\
+    (__mmask32)(U)))
+
+#define _mm512_maskz_slli_epi16(U, X, C)                                   \
+  ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\
+    (__v32hi)(__m512i)_mm512_setzero_si512 (),                            \
+    (__mmask32)(U)))
+
+#define _mm512_shufflehi_epi16(A, B)                                                \
+  ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
+                                             (__v32hi)(__m512i)                            \
+                                            _mm512_setzero_si512 (),               \
+                                             (__mmask32)-1))
+
+#define _mm512_mask_shufflehi_epi16(W, U, A, B)                                     \
+  ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
+                                             (__v32hi)(__m512i)(W),                 \
+                                             (__mmask32)(U)))
+
+#define _mm512_maskz_shufflehi_epi16(U, A, B)                                       \
+  ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
+                                             (__v32hi)(__m512i)                            \
+                                            _mm512_setzero_si512 (),               \
+                                             (__mmask32)(U)))
+
+#define _mm512_shufflelo_epi16(A, B)                                                \
+  ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
+                                             (__v32hi)(__m512i)                            \
+                                            _mm512_setzero_si512 (),               \
+                                             (__mmask32)-1))
+
+#define _mm512_mask_shufflelo_epi16(W, U, A, B)                                     \
+  ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
+                                             (__v32hi)(__m512i)(W),                 \
+                                             (__mmask32)(U)))
+
+#define _mm512_maskz_shufflelo_epi16(U, A, B)                                       \
+  ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B),       \
+                                             (__v32hi)(__m512i)                            \
+                                            _mm512_setzero_si512 (),               \
+                                             (__mmask32)(U)))
+
+#define _mm512_srai_epi16(A, B)                                         \
+  ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A),      \
+    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1))
+
+#define _mm512_mask_srai_epi16(W, U, A, B)                              \
+  ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A),      \
+    (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U)))
+
+#define _mm512_maskz_srai_epi16(U, A, B)                                \
+  ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A),      \
+    (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U)))
+
+#define _mm512_mask_blend_epi16(__U, __A, __W)                       \
+  ((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A),       \
+                                                   (__v32hi) (__W),  \
+                                                   (__mmask32) (__U)))
+
+#define _mm512_mask_blend_epi8(__U, __A, __W)                        \
+  ((__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) (__A),       \
+                                                   (__v64qi) (__W),  \
+                                                   (__mmask64) (__U)))
+
+#define _mm512_cmp_epi16_mask(X, Y, P)                         \
+  ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X),     \
+                                           (__v32hi)(__m512i)(Y), (int)(P),\
+                                           (__mmask32)(-1)))
+
+#define _mm512_cmp_epi8_mask(X, Y, P)                          \
+  ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X),     \
+                                           (__v64qi)(__m512i)(Y), (int)(P),\
+                                           (__mmask64)(-1)))
+
+#define _mm512_cmp_epu16_mask(X, Y, P)                         \
+  ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X),    \
+                                           (__v32hi)(__m512i)(Y), (int)(P),\
+                                           (__mmask32)(-1)))
+
+#define _mm512_cmp_epu8_mask(X, Y, P)                          \
+  ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X),    \
+                                           (__v64qi)(__m512i)(Y), (int)(P),\
+                                           (__mmask64)(-1)))
+
+#define _mm512_mask_cmp_epi16_mask(M, X, Y, P)                         \
+  ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X),     \
+                                           (__v32hi)(__m512i)(Y), (int)(P),\
+                                           (__mmask32)(M)))
+
+#define _mm512_mask_cmp_epi8_mask(M, X, Y, P)                          \
+  ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X),     \
+                                           (__v64qi)(__m512i)(Y), (int)(P),\
+                                           (__mmask64)(M)))
+
+#define _mm512_mask_cmp_epu16_mask(M, X, Y, P)                         \
+  ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X),    \
+                                           (__v32hi)(__m512i)(Y), (int)(P),\
+                                           (__mmask32)(M)))
+
+#define _mm512_mask_cmp_epu8_mask(M, X, Y, P)                          \
+  ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X),    \
+                                           (__v64qi)(__m512i)(Y), (int)(P),\
+                                           (__mmask64)(M)))
+
+#define _mm512_bslli_epi128(A, N)                                         \
+  ((__m512i)__builtin_ia32_pslldq512 ((__m512i)(A), (int)(N) * 8))
+
+#define _mm512_bsrli_epi128(A, N)                                         \
+  ((__m512i)__builtin_ia32_psrldq512 ((__m512i)(A), (int)(N) * 8))
+
+#endif
+
+#ifdef __DISABLE_AVX512BW__
+#undef __DISABLE_AVX512BW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512BW__ */
+
+#endif /* _AVX512BWINTRIN_H_INCLUDED */
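
As a rough usage sketch only (not part of the commit): the masked byte intrinsics defined in avx512bwintrin.h above compose as below. This assumes a toolchain built with -mavx512f -mavx512bw and AVX512BW-capable hardware; everything except the intrinsic names is illustrative.

/* Per-byte unsigned max of two 64-byte vectors, built from the compare and
   blend intrinsics declared in avx512bwintrin.h.  Compile with
   -mavx512f -mavx512bw. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  unsigned char av[64], bv[64], out[64];
  for (int i = 0; i < 64; i++)
    {
      av[i] = (unsigned char) i;          /* 0, 1, ..., 63 */
      bv[i] = (unsigned char) (63 - i);   /* 63, 62, ..., 0 */
    }

  __m512i a = _mm512_loadu_si512 ((const void *) av);
  __m512i b = _mm512_loadu_si512 ((const void *) bv);

  /* Bit i of the mask is set where a[i] < b[i] (unsigned compare). */
  __mmask64 lt = _mm512_cmplt_epu8_mask (a, b);

  /* Blend: take b where the mask bit is set, a elsewhere -> per-byte max. */
  __m512i mx = _mm512_mask_blend_epi8 (lt, a, b);

  _mm512_storeu_si512 ((void *) out, mx);
  printf ("mask=%016llx out[0]=%u out[63]=%u\n",
	  (unsigned long long) lt, (unsigned) out[0], (unsigned) out[63]);
  return 0;
}
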
diff --git a/include-gcc/avx512cdintrin.h b/include-gcc/avx512cdintrin.h
new file mode 100644 (file)
index 0000000..a5f5eab
--- /dev/null
+++ b/include-gcc/avx512cdintrin.h
@@ -0,0 +1,184 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512CDINTRIN_H_INCLUDED
+#define _AVX512CDINTRIN_H_INCLUDED
+
+#ifndef __AVX512CD__
+#pragma GCC push_options
+#pragma GCC target("avx512cd")
+#define __DISABLE_AVX512CD__
+#endif /* __AVX512CD__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conflict_epi32 (__m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                                              (__v16si) _mm512_setzero_si512 (),
+                                              (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                                                        (__v16si) __W,
+                                                        (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A,
+                                              (__v16si) _mm512_setzero_si512 (),
+                                              (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conflict_epi64 (__m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                                              (__v8di) _mm512_setzero_si512 (),
+                                              (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                                                        (__v8di) __W,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A,
+                                              (__v8di) _mm512_setzero_si512 (),
+                                              (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_lzcnt_epi64 (__m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                                          (__v8di) _mm512_setzero_si512 (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vplzcntq_512_mask ((__v8di) __A,
+                                          (__v8di) _mm512_setzero_si512 (),
+                                          (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_lzcnt_epi32 (__m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                                          (__v16si) _mm512_setzero_si512 (),
+                                          (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i)
+        __builtin_ia32_vplzcntd_512_mask ((__v16si) __A,
+                                          (__v16si) _mm512_setzero_si512 (),
+                                          (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmb512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_broadcastmw512 (__A);
+}
+
+#ifdef __DISABLE_AVX512CD__
+#undef __DISABLE_AVX512CD__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512CD__ */
+
+#endif /* _AVX512CDINTRIN_H_INCLUDED */
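Usage note (not part of the imported header): conflict detection is the core of avx512cdintrin.h. The sketch below, assuming -mavx512f -mavx512cd and a hypothetical helper name, flags lanes whose 32-bit index duplicates an earlier lane, which is the usual guard before a vectorized histogram or scatter update.

#include <immintrin.h>

/* Return a mask of lanes whose index repeats an earlier lane. */
static inline __mmask16
lanes_with_conflicts (__m512i indices)
{
  /* Each lane of conf has one bit set per earlier lane holding the same
     value, so any non-zero lane marks a duplicate index.  */
  __m512i conf = _mm512_conflict_epi32 (indices);
  return _mm512_test_epi32_mask (conf, conf);
}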
diff --git a/include-gcc/avx512dqintrin.h b/include-gcc/avx512dqintrin.h
new file mode 100644 (file)
index 0000000..93900a0
--- /dev/null
@@ -0,0 +1,2891 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512DQINTRIN_H_INCLUDED
+#define _AVX512DQINTRIN_H_INCLUDED
+
+#ifndef __AVX512DQ__
+#pragma GCC push_options
+#pragma GCC target("avx512dq")
+#define __DISABLE_AVX512DQ__
+#endif /* __AVX512DQ__ */
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask8_u8  (__mmask8 __A,  __mmask8 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_ktestcqi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestcqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktest_mask16_u8  (__mmask16 __A,  __mmask16 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_ktestchi (__A, __B);
+  return (unsigned char) __builtin_ia32_ktestzhi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestz_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestzhi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_ktestc_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+  return (unsigned char) __builtin_ia32_ktestchi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask8_u8  (__mmask8 __A,  __mmask8 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_kortestcqi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestzqi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask8_u8 (__mmask8 __A, __mmask8 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestcqi (__A, __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask8 (__mmask8 __A, __mmask8 __B)
+{
+  return (__mmask8) __builtin_ia32_kaddqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kadd_mask16 (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kaddhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask8_u32 (__mmask8 __A)
+{
+  return (unsigned int) __builtin_ia32_kmovb ((__mmask8 ) __A);
+}
+       
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask8 (unsigned int __A)
+{
+  return (__mmask8) __builtin_ia32_kmovb ((__mmask8) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask8 (__mmask8 *__A)
+{
+  return (__mmask8) __builtin_ia32_kmovb (*(__mmask8 *) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask8 (__mmask8 *__A, __mmask8 __B)
+{
+  *(__mmask8 *) __A = __builtin_ia32_kmovb (__B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_knot_mask8 (__mmask8 __A)
+{
+  return (__mmask8) __builtin_ia32_knotqi ((__mmask8) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+  return (__mmask8) __builtin_ia32_korqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxnor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+  return (__mmask8) __builtin_ia32_kxnorqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kxor_mask8 (__mmask8 __A, __mmask8 __B)
+{
+  return (__mmask8) __builtin_ia32_kxorqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kand_mask8 (__mmask8 __A, __mmask8 __B)
+{
+  return (__mmask8) __builtin_ia32_kandqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kandn_mask8 (__mmask8 __A, __mmask8 __B)
+{
+  return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f64x2 (__m128d __A)
+{
+  return (__m512d)
+        __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A,
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
+                                                          __A,
+                                                          (__v8df)
+                                                          __O, __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df)
+                                                          __A,
+                                                          (__v8df)
+                                                          _mm512_setzero_ps (),
+                                                          __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i64x2 (__m128i __A)
+{
+  return (__m512i)
+        __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A,
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
+                                                          __A,
+                                                          (__v8di)
+                                                          __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di)
+                                                          __A,
+                                                          (__v8di)
+                                                          _mm512_setzero_si512 (),
+                                                          __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x2 (__m128 __A)
+{
+  return (__m512)
+        __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                                                (__v16sf)_mm512_undefined_ps (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                                                         (__v16sf)
+                                                         __O, __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A,
+                                                         (__v16sf)
+                                                         _mm512_setzero_ps (),
+                                                         __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x2 (__m128i __A)
+{
+  return (__m512i)
+        __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si)
+                                                          __A,
+                                                          (__v16si)
+                                                          __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si)
+                                                          __A,
+                                                          (__v16si)
+                                                          _mm512_setzero_si512 (),
+                                                          __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x8 (__m256 __A)
+{
+  return (__m512)
+        __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                                                _mm512_undefined_ps (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                                                         (__v16sf)__O,
+                                                         __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A,
+                                                         (__v16sf)
+                                                         _mm512_setzero_ps (),
+                                                         __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x8 (__m256i __A)
+{
+  return (__m512i)
+        __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si)
+                                                          __A,
+                                                          (__v16si)__O,
+                                                          __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si)
+                                                          __A,
+                                                          (__v16si)
+                                                          _mm512_setzero_si512 (),
+                                                          __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+                        __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                   __m512d __B)
+{
+  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+                                               (__v8df) __B,
+                                               (__v8df)
+                                               _mm512_setzero_pd (),
+                                               (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+                                               (__v8df) __B,
+                                               (__v8df) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A,
+                                               (__v8df) __B,
+                                               (__v8df)
+                                               _mm512_setzero_pd (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+                                              (__v16sf) __B,
+                                              (__v16sf)
+                                              _mm512_setzero_ps (),
+                                              (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+                                              (__v16sf) __B,
+                                              (__v16sf) __W,
+                                              (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A,
+                                              (__v16sf) __B,
+                                              (__v16sf)
+                                              _mm512_setzero_ps (),
+                                              (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                   __m512d __B)
+{
+  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+                                                 (__v8df) __B,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                      __m512d __B)
+{
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+                                                 (__v8df) __B,
+                                                 (__v8df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A,
+                                                 (__v8df) __B,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf)
+                                                _mm512_setzero_ps (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                      __m512 __B)
+{
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A,
+                                                (__v16sf) __B,
+                                                (__v16sf)
+                                                _mm512_setzero_ps (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi32_mask (__m512i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movepi64_mask (__m512i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi32 (__mmask16 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2d512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movm_epi64 (__mmask8 __A)
+{
+  return (__m512i) __builtin_ia32_cvtmask2q512 (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epi64 (__m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epu64 (__m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) -1,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                                                     (__v8di) __W,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epi64 (__m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epu64 (__m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) -1,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                                                     (__v8di) __W,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epi64 (__m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epu64 (__m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A)
+{
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epi64 (__m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epu64 (__m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_ps (__m512i __A)
+{
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A)
+{
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A)
+{
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_ps (__m512i __A)
+{
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A)
+{
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A)
+{
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_pd (__m512i __A)
+{
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A)
+{
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_pd (__m512i __A)
+{
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                                                    (__v8df) __W,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A)
+{
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
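Usage note (not part of the imported header): AVX-512DQ adds direct 64-bit integer/double conversions such as _mm512_cvtepi64_pd and _mm512_cvttpd_epi64 defined above. A minimal round-trip sketch, assuming -mavx512f -mavx512dq and a hypothetical helper name:

#include <immintrin.h>
#include <stdint.h>

/* Halve eight 64-bit integers by going through double precision. */
static inline void
halve_epi64 (const int64_t *in, int64_t *out)
{
  __m512i v = _mm512_loadu_si512 (in);
  __m512d d = _mm512_cvtepi64_pd (v);                   /* 8 x i64 -> 8 x f64 */
  d = _mm512_mul_pd (d, _mm512_set1_pd (0.5));          /* scale in floating point */
  _mm512_storeu_si512 (out, _mm512_cvttpd_epi64 (d));   /* truncate back to i64 */
}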
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask8 (__mmask8 __A, unsigned int __B)
+{
+  return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask8 (__mmask8 __A, unsigned int __B)
+{
+  return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_pd (__m512d __A, __m512d __B, int __C)
+{
+  return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+                                                  (__v8df) __B, __C,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_pd (__m512d __W, __mmask8 __U,
+                     __m512d __A, __m512d __B, int __C)
+{
+  return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+                                                  (__v8df) __B, __C,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_pd (__mmask8 __U, __m512d __A, __m512d __B, int __C)
+{
+  return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+                                                  (__v8df) __B, __C,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_ps (__m512 __A, __m512 __B, int __C)
+{
+  return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __C,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_ps (__m512 __W, __mmask16 __U,
+                     __m512 __A, __m512 __B, int __C)
+{
+  return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __C,
+                                                 (__v16sf) __W,
+                                                 (__mmask16) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_ps (__mmask16 __U, __m512 __A, __m512 __B, int __C)
+{
+  return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __C,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_sd (__m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+                                                (__v2df) __B, __C,
+                                                (__v2df) _mm_setzero_pd (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
+{
+  return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+                                                      (__v2df) __B, __C,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_sd (__m128d __W,  __mmask8 __U, __m128d __A,
+                   __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+                                                (__v2df) __B, __C,
+                                                (__v2df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_sd (__m128d __W,  __mmask8 __U, __m128d __A,
+                         __m128d __B, int __C, const int __R)
+{
+  return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+                                                      (__v2df) __B, __C,
+                                                      (__v2df) __W,
+                                                      __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A,
+                                                (__v2df) __B, __C,
+                                                (__v2df) _mm_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          int __C, const int __R)
+{
+  return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A,
+                                                      (__v2df) __B, __C,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ss (__m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+                                               (__v4sf) __B, __C,
+                                               (__v4sf) _mm_setzero_ps (),
+                                               (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
+{
+  return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+                                                     (__v4sf) __B, __C,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ss (__m128 __W,  __mmask8 __U, __m128 __A,
+                   __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+                                               (__v4sf) __B, __C,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_ss (__m128 __W,  __mmask8 __U, __m128 __A,
+                         __m128 __B, int __C, const int __R)
+{
+  return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+                                                     (__v4sf) __B, __C,
+                                                     (__v4sf) __W,
+                                                     __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A,
+                                               (__v4sf) __B, __C,
+                                               (__v4sf) _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          int __C, const int __R)
+{
+  return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A,
+                                                     (__v4sf) __B, __C,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_sd (__m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df) __W,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_ss (__m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf) __W,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_round_sd (__m128d __A, __m128d __B, int __C, const int __R)
+{
+  return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                        int __C, const int __R)
+{
+  return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df) __W,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_round_ss (__m128 __A, __m128 __B, int __C, const int __R)
+{
+  return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                        int __C, const int __R)
+{
+  return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf) __W,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ss_mask (__m128 __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sd_mask (__m128d __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_ss_mask (__mmask8 __U, __m128 __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epi64 (__m512d __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epi64 (__mmask8 __U, __m512d __A,
+                                const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epu64 (__m512d __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) -1,
+                                                     __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                                                     (__v8di) __W,
+                                                     (__mmask8) __U,
+                                                     __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m512d __A,
+                                const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U,
+                                                     __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epi64 (__m256 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m256 __A,
+                                const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epu64 (__m256 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) -1,
+                                                     __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                                                     (__v8di) __W,
+                                                     (__mmask8) __U,
+                                                     __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epu64 (__mmask8 __U, __m256 __A,
+                                const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U,
+                                                     __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epi64 (__m512d __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) -1,
+                                                   __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A,
+                              const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epi64 (__mmask8 __U, __m512d __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epu64 (__m512d __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A,
+                              const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m512d __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epi64 (__m256 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) -1,
+                                                   __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A,
+                              const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epi64 (__mmask8 __U, __m256 __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epu64 (__m256 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) -1,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A,
+                              const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epu64 (__mmask8 __U, __m256 __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi64_ps (__m512i __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) -1,
+                                                  __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi64_ps (__m256 __W, __mmask8 __U, __m512i __A,
+                              const int __R)
+{
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U,
+                                                  __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi64_ps (__mmask8 __U, __m512i __A,
+                               const int __R)
+{
+  return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U,
+                                                  __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu64_ps (__m512i __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) -1,
+                                                   __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu64_ps (__m256 __W, __mmask8 __U, __m512i __A,
+                              const int __R)
+{
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu64_ps (__mmask8 __U, __m512i __A,
+                               const int __R)
+{
+  return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi64_pd (__m512i __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1,
+                                                   __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi64_pd (__m512d __W, __mmask8 __U, __m512i __A,
+                              const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi64_pd (__mmask8 __U, __m512i __A,
+                               const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U,
+                                                   __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu64_pd (__m512i __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) -1,
+                                                    __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu64_pd (__m512d __W, __mmask8 __U, __m512i __A,
+                              const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                                                    (__v8df) __W,
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu64_pd (__mmask8 __U, __m512i __A,
+                               const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) __U,
+                                                    __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_pd (__m512d __A, int __B)
+{
+  return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_round_pd (__m512d __A, int __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A,
+                                                         __B,
+                                                         (__v8df)
+                                                         _mm512_setzero_pd (),
+                                                         (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B)
+{
+  return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                            int __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A,
+                                                         __B,
+                                                         (__v8df) __W,
+                                                         __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B)
+{
+  return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_round_pd (__mmask8 __U, __m512d __A, int __B,
+                             const int __R)
+{
+  return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A,
+                                                         __B,
+                                                         (__v8df)
+                                                         _mm512_setzero_pd (),
+                                                         __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_ps (__m512 __A, int __B)
+{
+  return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_round_ps (__m512 __A, int __B, const int __R)
+{
+  return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A,
+                                                        __B,
+                                                        (__v16sf)
+                                                        _mm512_setzero_ps (),
+                                                        (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B)
+{
+  return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B,
+                            const int __R)
+{
+  return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A,
+                                                        __B,
+                                                        (__v16sf) __W,
+                                                        __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B)
+{
+  return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_round_ps (__mmask16 __U, __m512 __A, int __B,
+                             const int __R)
+{
+  return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A,
+                                                        __B,
+                                                        (__v16sf)
+                                                        _mm512_setzero_ps (),
+                                                        __U, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf32x8_ps (__m512 __A, const int __imm)
+{
+  return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A,
+                                                   __imm,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf32x8_ps (__m256 __W, __mmask8 __U, __m512 __A,
+                            const int __imm)
+{
+  return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A,
+                                                   __imm,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf32x8_ps (__mmask8 __U, __m512 __A,
+                             const int __imm)
+{
+  return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A,
+                                                   __imm,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf64x2_pd (__m512d __A, const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A,
+                                                        __imm,
+                                                        (__v2df)
+                                                        _mm_setzero_pd (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m512d __A,
+                            const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A,
+                                                        __imm,
+                                                        (__v2df) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf64x2_pd (__mmask8 __U, __m512d __A,
+                             const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A,
+                                                        __imm,
+                                                        (__v2df)
+                                                        _mm_setzero_pd (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti32x8_epi32 (__m512i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A,
+                                                    __imm,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti32x8_epi32 (__m256i __W, __mmask8 __U, __m512i __A,
+                               const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A,
+                                                    __imm,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti32x8_epi32 (__mmask8 __U, __m512i __A,
+                                const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A,
+                                                    __imm,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti64x2_epi64 (__m512i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A,
+                                                        __imm,
+                                                        (__v2di)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m512i __A,
+                               const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A,
+                                                        __imm,
+                                                        (__v2di) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti64x2_epi64 (__mmask8 __U, __m512i __A,
+                                const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A,
+                                                        __imm,
+                                                        (__v2di)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_round_pd (__m512d __A, __m512d __B, int __C,
+                      const int __R)
+{
+  return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+                                                  (__v8df) __B, __C,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) -1,
+                                                  __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_round_pd (__m512d __W, __mmask8 __U,
+                           __m512d __A, __m512d __B, int __C,
+                           const int __R)
+{
+  return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+                                                  (__v8df) __B, __C,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U,
+                                                  __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                            int __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A,
+                                                  (__v8df) __B, __C,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U,
+                                                  __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_range_round_ps (__m512 __A, __m512 __B, int __C, const int __R)
+{
+  return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __C,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) -1,
+                                                 __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_range_round_ps (__m512 __W, __mmask16 __U,
+                           __m512 __A, __m512 __B, int __C,
+                           const int __R)
+{
+  return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __C,
+                                                 (__v16sf) __W,
+                                                 (__mmask16) __U,
+                                                 __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_range_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                            int __C, const int __R)
+{
+  return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __C,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __U,
+                                                 __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti32x8 (__m512i __A, __m256i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A,
+                                                   (__v8si) __B,
+                                                   __imm,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti32x8 (__m512i __W, __mmask16 __U, __m512i __A,
+                        __m256i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A,
+                                                   (__v8si) __B,
+                                                   __imm,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti32x8 (__mmask16 __U, __m512i __A, __m256i __B,
+                         const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A,
+                                                   (__v8si) __B,
+                                                   __imm,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf32x8 (__m512 __A, __m256 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A,
+                                                  (__v8sf) __B,
+                                                  __imm,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf32x8 (__m512 __W, __mmask16 __U, __m512 __A,
+                        __m256 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A,
+                                                  (__v8sf) __B,
+                                                  __imm,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf32x8 (__mmask16 __U, __m512 __A, __m256 __B,
+                         const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A,
+                                                  (__v8sf) __B,
+                                                  __imm,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti64x2 (__m512i __A, __m128i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A,
+                                                       (__v2di) __B,
+                                                       __imm,
+                                                       (__v8di)
+                                                       _mm512_setzero_si512 (),
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti64x2 (__m512i __W, __mmask8 __U, __m512i __A,
+                        __m128i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A,
+                                                       (__v2di) __B,
+                                                       __imm,
+                                                       (__v8di) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti64x2 (__mmask8 __U, __m512i __A, __m128i __B,
+                         const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A,
+                                                       (__v2di) __B,
+                                                       __imm,
+                                                       (__v8di)
+                                                       _mm512_setzero_si512 (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf64x2 (__m512d __A, __m128d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A,
+                                                       (__v2df) __B,
+                                                       __imm,
+                                                       (__v8df)
+                                                       _mm512_setzero_pd (),
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf64x2 (__m512d __W, __mmask8 __U, __m512d __A,
+                        __m128d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A,
+                                                       (__v2df) __B,
+                                                       __imm,
+                                                       (__v8df) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf64x2 (__mmask8 __U, __m512d __A, __m128d __B,
+                         const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A,
+                                                       (__v2df) __B,
+                                                       __imm,
+                                                       (__v8df)
+                                                       _mm512_setzero_pd (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_pd_mask (__mmask8 __U, __m512d __A,
+                            const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A,
+                                                     __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_pd_mask (__m512d __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ps_mask (__mmask16 __U, __m512 __A,
+                            const int __imm)
+{
+  return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A,
+                                                      __imm, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ps_mask (__m512 __A, const int __imm)
+{
+  return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A,
+                                                      __imm,
+                                                      (__mmask16) -1);
+}
+
+#else
+#define _kshiftli_mask8(X, Y)                                          \
+  ((__mmask8) __builtin_ia32_kshiftliqi ((__mmask8)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask8(X, Y)                                          \
+  ((__mmask8) __builtin_ia32_kshiftriqi ((__mmask8)(X), (__mmask8)(Y)))
+
+#define _mm_range_sd(A, B, C)                                           \
+  ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),         \
+    (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_range_sd(W, U, A, B, C)                                \
+  ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W),               \
+    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_range_sd(U, A, B, C)                                  \
+  ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),         \
+    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_range_ss(A, B, C)                                          \
+  ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),         \
+    (__mmask8) -1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_range_ss(W, U, A, B, C)                               \
+  ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W),                        \
+    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_range_ss(U, A, B, C)                                 \
+  ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),         \
+    (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_range_round_sd(A, B, C, R)                                  \
+  ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),                 \
+    (__mmask8) -1, (R)))
+
+#define _mm_mask_range_round_sd(W, U, A, B, C, R)                       \
+  ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W),               \
+    (__mmask8)(U), (R)))
+
+#define _mm_maskz_range_round_sd(U, A, B, C, R)                                 \
+  ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),                 \
+    (__mmask8)(U), (R)))
+
+#define _mm_range_round_ss(A, B, C, R)                                 \
+  ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),         \
+    (__mmask8) -1, (R)))
+
+#define _mm_mask_range_round_ss(W, U, A, B, C, R)                      \
+  ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W),                        \
+    (__mmask8)(U), (R)))
+
+#define _mm_maskz_range_round_ss(U, A, B, C, R)                                \
+  ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),         \
+    (__mmask8)(U), (R)))
+
+#define _mm512_cvtt_roundpd_epi64(A, B)                    \
+  ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)            \
+                                             _mm512_setzero_si512 (),  \
+                                             -1, (B)))
+
+#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, B)  \
+    ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundpd_epi64(U, A, B)    \
+    ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvtt_roundpd_epu64(A, B)                    \
+    ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, B)  \
+    ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundpd_epu64(U, A, B)    \
+    ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvtt_roundps_epi64(A, B)                    \
+    ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvtt_roundps_epi64(W, U, A, B)  \
+    ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundps_epi64(U, A, B)    \
+    ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvtt_roundps_epu64(A, B)                    \
+    ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvtt_roundps_epu64(W, U, A, B)  \
+    ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvtt_roundps_epu64(U, A, B)    \
+    ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundpd_epi64(A, B)             \
+    ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundpd_epi64(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundpd_epi64(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundpd_epu64(A, B)             \
+    ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundpd_epu64(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundpd_epu64(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundps_epi64(A, B)             \
+    ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundps_epi64(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundps_epi64(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundps_epu64(A, B)             \
+    ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B)))
+
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)(W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundps_epu64(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B)))
+
+#define _mm512_cvt_roundepi64_ps(A, B)             \
+    ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepi64_ps(W, U, A, B)   \
+    ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepi64_ps(U, A, B)     \
+    ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B)))
+
+#define _mm512_cvt_roundepu64_ps(A, B)             \
+    ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepu64_ps(W, U, A, B)   \
+    ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepu64_ps(U, A, B)     \
+    ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B)))
+
+#define _mm512_cvt_roundepi64_pd(A, B)             \
+    ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepi64_pd(W, U, A, B)   \
+    ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepi64_pd(U, A, B)     \
+    ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B)))
+
+#define _mm512_cvt_roundepu64_pd(A, B)             \
+    ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B)))
+
+#define _mm512_mask_cvt_roundepu64_pd(W, U, A, B)   \
+    ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (W), (U), (B)))
+
+#define _mm512_maskz_cvt_roundepu64_pd(U, A, B)     \
+    ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B)))
+
+#define _mm512_reduce_pd(A, B)                                         \
+  ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A),    \
+    (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1))
+
+#define _mm512_reduce_round_pd(A, B, R)                                         \
+  ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\
+    (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R)))
+
+#define _mm512_mask_reduce_pd(W, U, A, B)                              \
+  ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A),    \
+    (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U)))
+
+#define _mm512_mask_reduce_round_pd(W, U, A, B, R)                      \
+  ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\
+    (int)(B), (__v8df)(__m512d)(W), (U), (R)))
+
+#define _mm512_maskz_reduce_pd(U, A, B)                                        \
+  ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A),    \
+    (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U)))
+
+#define _mm512_maskz_reduce_round_pd(U, A, B, R)                        \
+  ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\
+    (int)(B), (__v8df)_mm512_setzero_pd (), (U), (R)))
+
+#define _mm512_reduce_ps(A, B)                                         \
+  ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A),     \
+    (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1))
+
+#define _mm512_reduce_round_ps(A, B, R)                                        \
+  ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\
+    (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R)))
+
+#define _mm512_mask_reduce_ps(W, U, A, B)                              \
+  ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A),     \
+    (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U)))
+
+#define _mm512_mask_reduce_round_ps(W, U, A, B, R)                     \
+  ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\
+    (int)(B), (__v16sf)(__m512)(W), (U), (R)))
+
+#define _mm512_maskz_reduce_ps(U, A, B)                                        \
+  ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A),     \
+    (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U)))
+
+#define _mm512_maskz_reduce_round_ps(U, A, B, R)                       \
+  ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\
+    (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R)))
+
+#define _mm512_extractf32x8_ps(X, C)                                    \
+  ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X),    \
+    (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8)-1))
+
+#define _mm512_mask_extractf32x8_ps(W, U, X, C)                         \
+  ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X),    \
+    (int) (C), (__v8sf)(__m256) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extractf32x8_ps(U, X, C)                           \
+  ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X),    \
+    (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8) (U)))
+
+#define _mm512_extractf64x2_pd(X, C)                                    \
+  ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\
+    (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8)-1))
+
+#define _mm512_mask_extractf64x2_pd(W, U, X, C)                         \
+  ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\
+    (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extractf64x2_pd(U, X, C)                           \
+  ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\
+    (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8) (U)))
+
+#define _mm512_extracti32x8_epi32(X, C)                                 \
+  ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X),  \
+    (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)-1))
+
+#define _mm512_mask_extracti32x8_epi32(W, U, X, C)                      \
+  ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X),  \
+    (int) (C), (__v8si)(__m256i) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extracti32x8_epi32(U, X, C)                        \
+  ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X),  \
+    (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8) (U)))
+
+#define _mm512_extracti64x2_epi64(X, C)                                 \
+  ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\
+    (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm512_mask_extracti64x2_epi64(W, U, X, C)                      \
+  ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\
+    (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm512_maskz_extracti64x2_epi64(U, X, C)                        \
+  ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\
+    (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
+#define _mm512_range_pd(A, B, C)                                       \
+  ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A),     \
+    (__v8df)(__m512d)(B), (int)(C),                                    \
+    (__v8df)_mm512_setzero_pd (), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_range_pd(W, U, A, B, C)                            \
+  ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A),     \
+    (__v8df)(__m512d)(B), (int)(C),                                    \
+    (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_range_pd(U, A, B, C)                              \
+  ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A),     \
+    (__v8df)(__m512d)(B), (int)(C),                                    \
+    (__v8df)_mm512_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_range_ps(A, B, C)                                       \
+  ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A),      \
+    (__v16sf)(__m512)(B), (int)(C),                                    \
+    (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_range_ps(W, U, A, B, C)                            \
+  ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A),      \
+    (__v16sf)(__m512)(B), (int)(C),                                    \
+    (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_range_ps(U, A, B, C)                              \
+  ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A),      \
+    (__v16sf)(__m512)(B), (int)(C),                                    \
+    (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_range_round_pd(A, B, C, R)                                      \
+  ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A),     \
+    (__v8df)(__m512d)(B), (int)(C),                                    \
+    (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R)))
+
+#define _mm512_mask_range_round_pd(W, U, A, B, C, R)                           \
+  ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A),     \
+    (__v8df)(__m512d)(B), (int)(C),                                    \
+    (__v8df)(__m512d)(W), (__mmask8)(U), (R)))
+
+#define _mm512_maskz_range_round_pd(U, A, B, C, R)                             \
+  ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A),     \
+    (__v8df)(__m512d)(B), (int)(C),                                    \
+    (__v8df)_mm512_setzero_pd (), (__mmask8)(U), (R)))
+
+#define _mm512_range_round_ps(A, B, C, R)                                      \
+  ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A),      \
+    (__v16sf)(__m512)(B), (int)(C),                                    \
+    (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R)))
+
+#define _mm512_mask_range_round_ps(W, U, A, B, C, R)                           \
+  ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A),      \
+    (__v16sf)(__m512)(B), (int)(C),                                    \
+    (__v16sf)(__m512)(W), (__mmask16)(U), (R)))
+
+#define _mm512_maskz_range_round_ps(U, A, B, C, R)                             \
+  ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A),      \
+    (__v16sf)(__m512)(B), (int)(C),                                    \
+    (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R)))
+
+#define _mm512_insertf64x2(X, Y, C)                                     \
+  ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\
+    (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (X),            \
+    (__mmask8)-1))
+
+#define _mm512_mask_insertf64x2(W, U, X, Y, C)                          \
+  ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\
+    (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (W),            \
+    (__mmask8) (U)))
+
+#define _mm512_maskz_insertf64x2(U, X, Y, C)                            \
+  ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\
+    (__v2df)(__m128d) (Y), (int) (C),                                   \
+    (__v8df)(__m512d) _mm512_setzero_pd (), (__mmask8) (U)))
+
+#define _mm512_inserti64x2(X, Y, C)                                     \
+  ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\
+    (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (X), (__mmask8)-1))
+
+#define _mm512_mask_inserti64x2(W, U, X, Y, C)                          \
+  ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\
+    (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (W),            \
+    (__mmask8) (U)))
+
+#define _mm512_maskz_inserti64x2(U, X, Y, C)                            \
+  ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\
+    (__v2di)(__m128i) (Y), (int) (C),                                   \
+    (__v8di)(__m512i) _mm512_setzero_si512 (), (__mmask8) (U)))
+
+#define _mm512_insertf32x8(X, Y, C)                                     \
+  ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X),     \
+    (__v8sf)(__m256) (Y), (int) (C),\
+    (__v16sf)(__m512)_mm512_setzero_ps (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_insertf32x8(W, U, X, Y, C)                          \
+  ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X),     \
+    (__v8sf)(__m256) (Y), (int) (C),\
+    (__v16sf)(__m512)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_insertf32x8(U, X, Y, C)                            \
+  ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X),     \
+    (__v8sf)(__m256) (Y), (int) (C),\
+    (__v16sf)(__m512)_mm512_setzero_ps (),\
+    (__mmask16)(U)))
+
+#define _mm512_inserti32x8(X, Y, C)                                     \
+  ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X),   \
+    (__v8si)(__m256i) (Y), (int) (C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_inserti32x8(W, U, X, Y, C)                          \
+  ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X),   \
+    (__v8si)(__m256i) (Y), (int) (C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_inserti32x8(U, X, Y, C)                            \
+  ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X),   \
+    (__v8si)(__m256i) (Y), (int) (C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+
+#define _mm_fpclass_ss_mask(X, C)                                      \
+  ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X),    \
+                                            (int) (C), (__mmask8) (-1)))
+
+#define _mm_fpclass_sd_mask(X, C)                                      \
+  ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
+                                            (int) (C), (__mmask8) (-1)))
+
+#define _mm_mask_fpclass_ss_mask(X, C, U)                              \
+  ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X),    \
+                                            (int) (C), (__mmask8) (U)))
+
+#define _mm_mask_fpclass_sd_mask(X, C, U)                              \
+  ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X),   \
+                                            (int) (C), (__mmask8) (U)))
+
+#define _mm512_mask_fpclass_pd_mask(u, X, C)                            \
+  ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), \
+                                               (int) (C), (__mmask8)(u)))
+
+#define _mm512_mask_fpclass_ps_mask(u, x, c)                           \
+  ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\
+                                                (int) (c),(__mmask16)(u)))
+
+#define _mm512_fpclass_pd_mask(X, C)                                    \
+  ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), \
+                                               (int) (C), (__mmask8)-1))
+
+#define _mm512_fpclass_ps_mask(x, c)                                    \
+  ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\
+                                                (int) (c),(__mmask16)-1))
+
+#define _mm_reduce_sd(A, B, C)                                         \
+  ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A),       \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),                \
+    (__mmask8)-1))
+
+#define _mm_mask_reduce_sd(W, U, A, B, C)                              \
+  ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A),       \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_sd(U, A, B, C)                                        \
+  ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A),       \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),                \
+    (__mmask8)(U)))
+
+#define _mm_reduce_round_sd(A, B, C, R)                                \
+  ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),        \
+    (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_reduce_round_sd(W, U, A, B, C, R)                    \
+  ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W),             \
+    (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_reduce_round_sd(U, A, B, C, R)                      \
+  ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \
+    (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (),               \
+    (__mmask8)(U), (int)(R)))
+
+#define _mm_reduce_ss(A, B, C)                                         \
+  ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A),         \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),         \
+    (__mmask8)-1))
+
+#define _mm_mask_reduce_ss(W, U, A, B, C)                              \
+  ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A),         \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_ss(U, A, B, C)                                        \
+  ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A),         \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),         \
+    (__mmask8)(U)))
+
+#define _mm_reduce_round_ss(A, B, C, R)                                \
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),         \
+    (__mmask8)-1, (int)(R)))
+
+#define _mm_mask_reduce_round_ss(W, U, A, B, C, R)                    \
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),   \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W),                       \
+    (__mmask8)(U), (int)(R)))
+
+#define _mm_maskz_reduce_round_ss(U, A, B, C, R)                      \
+  ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A),  \
+    (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (),        \
+    (__mmask8)(U), (int)(R)))
+
+
+#endif
+
+#ifdef __DISABLE_AVX512DQ__
+#undef __DISABLE_AVX512DQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512DQ__ */
+
+#endif /* _AVX512DQINTRIN_H_INCLUDED */
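As a quick illustration of the AVX-512DQ range macros added above, here is a minimal standalone sketch (not part of the committed header). It assumes a GCC toolchain with `-mavx512f -mavx512dq` and a CPU or emulator that supports AVX-512DQ; the imm8 value of 1 (which selects the max-value operation, with sign control 0) is only illustrative.

    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* VRANGESD: imm8 bits [1:0] pick the operation (1 = max),
         bits [3:2] pick the sign control (0 = sign of first source).  */
      __m128d a = _mm_set_sd (3.0);
      __m128d b = _mm_set_sd (-7.0);
      __m128d r = _mm_range_sd (a, b, 1);
      printf ("range_sd -> %f\n", _mm_cvtsd_f64 (r));
      return 0;
    }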
diff --git a/include-gcc/avx512erintrin.h b/include-gcc/avx512erintrin.h
new file mode 100644 (file)
index 0000000..bd83b7f
--- /dev/null
@@ -0,0 +1,536 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512ERINTRIN_H_INCLUDED
+#define _AVX512ERINTRIN_H_INCLUDED
+
+#ifndef __AVX512ER__
+#pragma GCC push_options
+#pragma GCC target("avx512er")
+#define __DISABLE_AVX512ER__
+#endif /* __AVX512ER__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef double __v8df __attribute__ ((__vector_size__ (64)));
+typedef float __v16sf __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_exp2a23_round_pd (__m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+                                              (__v8df) _mm512_undefined_pd (),
+                                              (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+                                              (__v8df) __W,
+                                              (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A,
+                                              (__v8df) _mm512_setzero_pd (),
+                                              (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_exp2a23_round_ps (__m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+                                             (__v16sf) _mm512_undefined_ps (),
+                                             (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+                                             (__v16sf) __W,
+                                             (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A,
+                                             (__v16sf) _mm512_setzero_ps (),
+                                             (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp28_round_pd (__m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+                                               (__v8df) _mm512_undefined_pd (),
+                                               (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+                                               (__v8df) __W,
+                                               (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A,
+                                               (__v8df) _mm512_setzero_pd (),
+                                               (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp28_round_ps (__m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+                                              (__v16sf) _mm512_undefined_ps (),
+                                              (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+                                              (__v16sf) __W,
+                                              (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A,
+                                              (__v16sf) _mm512_setzero_ps (),
+                                              (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __B,
+                                                (__v2df) __A,
+                                                __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp28_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                        __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B,
+                                                     (__v2df) __A,
+                                                     (__v2df) __W,
+                                                     __U,
+                                                     __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B,
+                                                     (__v2df) __A,
+                                                     (__v2df)
+                                                     _mm_setzero_pd (),
+                                                     __U,
+                                                     __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __B,
+                                               (__v4sf) __A,
+                                               __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp28_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                        __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B,
+                                                    (__v4sf) __A,
+                                                    (__v4sf) __W,
+                                                    __U,
+                                                    __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B,
+                                                    (__v4sf) __A,
+                                                    (__v4sf)
+                                                    _mm_setzero_ps (),
+                                                    __U,
+                                                    __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt28_round_pd (__m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+                                                 (__v8df) _mm512_undefined_pd (),
+                                                 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+                                                 (__v8df) __W,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R)
+{
+  return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A,
+                                                 (__v8df) _mm512_setzero_pd (),
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt28_round_ps (__m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+                                                (__v16sf) _mm512_undefined_ps (),
+                                                (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+                                                (__v16sf) __W,
+                                                (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R)
+{
+  return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A,
+                                                (__v16sf) _mm512_setzero_ps (),
+                                                (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __B,
+                                                  (__v2df) __A,
+                                                  __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt28_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                          __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B,
+                                                       (__v2df) __A,
+                                                       (__v2df) __W,
+                                                       __U,
+                                                       __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R)
+{
+  return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B,
+                                                       (__v2df) __A,
+                                                       (__v2df)
+                                                       _mm_setzero_pd (),
+                                                       __U,
+                                                       __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __B,
+                                                 (__v4sf) __A,
+                                                 __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt28_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                          __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B,
+                                                      (__v4sf) __A,
+                                                      (__v4sf) __W,
+                                                      __U,
+                                                      __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R)
+{
+  return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B,
+                                                      (__v4sf) __A,
+                                                      (__v4sf)
+                                                      _mm_setzero_ps (),
+                                                      __U,
+                                                      __R);
+}
+
+#else
+#define _mm512_exp2a23_round_pd(A, C)            \
+    __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \
+    __builtin_ia32_exp2pd_mask(A, W, U, C)
+
+#define _mm512_maskz_exp2a23_round_pd(U, A, C)   \
+    __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_exp2a23_round_ps(A, C)            \
+    __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \
+    __builtin_ia32_exp2ps_mask(A, W, U, C)
+
+#define _mm512_maskz_exp2a23_round_ps(U, A, C)   \
+    __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_rcp28_round_pd(A, C)            \
+    __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_rcp28_round_pd(W, U, A, C) \
+    __builtin_ia32_rcp28pd_mask(A, W, U, C)
+
+#define _mm512_maskz_rcp28_round_pd(U, A, C)   \
+    __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_rcp28_round_ps(A, C)            \
+    __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_rcp28_round_ps(W, U, A, C) \
+    __builtin_ia32_rcp28ps_mask(A, W, U, C)
+
+#define _mm512_maskz_rcp28_round_ps(U, A, C)   \
+    __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_rsqrt28_round_pd(A, C)            \
+    __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C)
+
+#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \
+    __builtin_ia32_rsqrt28pd_mask(A, W, U, C)
+
+#define _mm512_maskz_rsqrt28_round_pd(U, A, C)   \
+    __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_rsqrt28_round_ps(A, C)            \
+    __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C)
+
+#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \
+    __builtin_ia32_rsqrt28ps_mask(A, W, U, C)
+
+#define _mm512_maskz_rsqrt28_round_ps(U, A, C)   \
+    __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_rcp28_round_sd(A, B, R)    \
+    __builtin_ia32_rcp28sd_round(A, B, R)
+
+#define _mm_mask_rcp28_round_sd(W, U, A, B, R) \
+    __builtin_ia32_rcp28sd_mask_round ((A), (B), (W), (U), (R))
+
+#define _mm_maskz_rcp28_round_sd(U, A, B, R)   \
+    __builtin_ia32_rcp28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (), \
+                                      (U), (R))
+
+#define _mm_rcp28_round_ss(A, B, R)    \
+    __builtin_ia32_rcp28ss_round(A, B, R)
+
+#define _mm_mask_rcp28_round_ss(W, U, A, B, R) \
+    __builtin_ia32_rcp28ss_mask_round ((A), (B), (W), (U), (R))
+
+#define _mm_maskz_rcp28_round_ss(U, A, B, R)   \
+    __builtin_ia32_rcp28ss_mask_round ((A), (B), (__v4sf) _mm_setzero_ps (), \
+                                      (U), (R))
+
+#define _mm_rsqrt28_round_sd(A, B, R)  \
+    __builtin_ia32_rsqrt28sd_round(A, B, R)
+
+#define _mm_mask_rsqrt28_round_sd(W, U, A, B, R)       \
+    __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (W), (U), (R))
+
+#define _mm_maskz_rsqrt28_round_sd(U, A, B, R) \
+    __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (),\
+                                        (U), (R))
+
+#define _mm_rsqrt28_round_ss(A, B, R)  \
+    __builtin_ia32_rsqrt28ss_round(A, B, R)
+
+#define _mm_mask_rsqrt28_round_ss(W, U, A, B, R)       \
+    __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (W), (U), (R))
+
+#define _mm_maskz_rsqrt28_round_ss(U, A, B, R) \
+    __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (__v4sf) _mm_setzero_ps (),\
+                                        (U), (R))
+
+#endif
+
+#define _mm_mask_rcp28_sd(W, U, A, B)\
+    _mm_mask_rcp28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_sd(U, A, B)\
+    _mm_maskz_rcp28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rcp28_ss(W, U, A, B)\
+    _mm_mask_rcp28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rcp28_ss(U, A, B)\
+    _mm_maskz_rcp28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_sd(W, U, A, B)\
+    _mm_mask_rsqrt28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_sd(U, A, B)\
+    _mm_maskz_rsqrt28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_rsqrt28_ss(W, U, A, B)\
+    _mm_mask_rsqrt28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_rsqrt28_ss(U, A, B)\
+    _mm_maskz_rsqrt28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_pd(A)                    \
+    _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_pd(W, U, A)   \
+    _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_pd(U, A)     \
+    _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_exp2a23_ps(A)                    \
+    _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_exp2a23_ps(W, U, A)   \
+    _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_exp2a23_ps(U, A)     \
+    _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_pd(A)                    \
+    _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_pd(W, U, A)   \
+    _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_pd(U, A)     \
+    _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rcp28_ps(A)                    \
+    _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rcp28_ps(W, U, A)   \
+    _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rcp28_ps(U, A)     \
+    _mm512_maskz_rcp28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_pd(A)                    \
+    _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_pd(W, U, A)   \
+    _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_pd(U, A)     \
+    _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_rsqrt28_ps(A)                    \
+    _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_mask_rsqrt28_ps(W, U, A)   \
+    _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_maskz_rsqrt28_ps(U, A)     \
+    _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_sd(A, B)     \
+    __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rcp28_ss(A, B)     \
+    __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_sd(A, B)   \
+    __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_rsqrt28_ss(A, B)   \
+    __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION)
+
+#ifdef __DISABLE_AVX512ER__
+#undef __DISABLE_AVX512ER__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512ER__ */
+
+#endif /* _AVX512ERINTRIN_H_INCLUDED */
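The AVX-512ER intrinsics above are only implemented on Xeon Phi parts (Knights Landing/Knights Mill). A minimal usage sketch, assuming such a target or an emulator and a build with `-mavx512f -mavx512er`, using the 2^-28 approximate-reciprocal form:

    #include <immintrin.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* VRCP28PS: reciprocal approximation with relative error <= 2^-28.  */
      __m512 x = _mm512_set1_ps (8.0f);
      __m512 r = _mm512_rcp28_ps (x);
      float out[16];
      _mm512_storeu_ps (out, r);
      printf ("rcp28(8.0) ~ %f\n", out[0]);
      return 0;
    }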
diff --git a/include-gcc/avx512fintrin.h b/include-gcc/avx512fintrin.h
new file mode 100644 (file)
index 0000000..89b3219
--- /dev/null
@@ -0,0 +1,16483 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512FINTRIN_H_INCLUDED
+#define _AVX512FINTRIN_H_INCLUDED
+
+#ifndef __AVX512F__
+#pragma GCC push_options
+#pragma GCC target("avx512f")
+#define __DISABLE_AVX512F__
+#endif /* __AVX512F__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef double __v8df __attribute__ ((__vector_size__ (64)));
+typedef float __v16sf __attribute__ ((__vector_size__ (64)));
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+typedef unsigned int __v16su __attribute__ ((__vector_size__ (64)));
+typedef short __v32hi __attribute__ ((__vector_size__ (64)));
+typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64)));
+typedef char __v64qi __attribute__ ((__vector_size__ (64)));
+typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* Unaligned version of the same type.  */
+typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1)));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_int2mask (int __M)
+{
+  return (__mmask16) __M;
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2int (__mmask16 __M)
+{
+  return (int) __M;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi64 (long long __A, long long __B, long long __C,
+                 long long __D, long long __E, long long __F,
+                 long long __G, long long __H)
+{
+  return __extension__ (__m512i) (__v8di)
+        { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H I J K L M N O P].  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi32 (int __A, int __B, int __C, int __D,
+                 int __E, int __F, int __G, int __H,
+                 int __I, int __J, int __K, int __L,
+                 int __M, int __N, int __O, int __P)
+{
+  return __extension__ (__m512i)(__v16si)
+        { __P, __O, __N, __M, __L, __K, __J, __I,
+          __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi16 (short __q31, short __q30, short __q29, short __q28,
+                 short __q27, short __q26, short __q25, short __q24,
+                 short __q23, short __q22, short __q21, short __q20,
+                 short __q19, short __q18, short __q17, short __q16,
+                 short __q15, short __q14, short __q13, short __q12,
+                 short __q11, short __q10, short __q09, short __q08,
+                 short __q07, short __q06, short __q05, short __q04,
+                 short __q03, short __q02, short __q01, short __q00)
+{
+  return __extension__ (__m512i)(__v32hi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
+  };
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_epi8 (char __q63, char __q62, char __q61, char __q60,
+                char __q59, char __q58, char __q57, char __q56,
+                char __q55, char __q54, char __q53, char __q52,
+                char __q51, char __q50, char __q49, char __q48,
+                char __q47, char __q46, char __q45, char __q44,
+                char __q43, char __q42, char __q41, char __q40,
+                char __q39, char __q38, char __q37, char __q36,
+                char __q35, char __q34, char __q33, char __q32,
+                char __q31, char __q30, char __q29, char __q28,
+                char __q27, char __q26, char __q25, char __q24,
+                char __q23, char __q22, char __q21, char __q20,
+                char __q19, char __q18, char __q17, char __q16,
+                char __q15, char __q14, char __q13, char __q12,
+                char __q11, char __q10, char __q09, char __q08,
+                char __q07, char __q06, char __q05, char __q04,
+                char __q03, char __q02, char __q01, char __q00)
+{
+  return __extension__ (__m512i)(__v64qi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31,
+    __q32, __q33, __q34, __q35, __q36, __q37, __q38, __q39,
+    __q40, __q41, __q42, __q43, __q44, __q45, __q46, __q47,
+    __q48, __q49, __q50, __q51, __q52, __q53, __q54, __q55,
+    __q56, __q57, __q58, __q59, __q60, __q61, __q62, __q63
+  };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_pd (double __A, double __B, double __C, double __D,
+              double __E, double __F, double __G, double __H)
+{
+  return __extension__ (__m512d)
+        { __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_ps (float __A, float __B, float __C, float __D,
+              float __E, float __F, float __G, float __H,
+              float __I, float __J, float __K, float __L,
+              float __M, float __N, float __O, float __P)
+{
+  return __extension__ (__m512)
+        { __P, __O, __N, __M, __L, __K, __J, __I,
+          __H, __G, __F, __E, __D, __C, __B, __A };
+}
+
+#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7)                           \
+  _mm512_set_epi64(e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7,                           \
+                         e8,e9,e10,e11,e12,e13,e14,e15)                      \
+  _mm512_set_epi32(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7)                                      \
+  _mm512_set_pd(e7,e6,e5,e4,e3,e2,e1,e0)
+
+#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
+  _mm512_set_ps(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0)
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_ps (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m512 __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+#define _mm512_undefined _mm512_undefined_ps
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_pd (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m512d __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_epi32 (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m512i __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+#define _mm512_undefined_si512 _mm512_undefined_epi32
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi8 (char __A)
+{
+  return __extension__ (__m512i)(__v64qi)
+        { __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi16 (short __A)
+{
+  return __extension__ (__m512i)(__v32hi)
+        { __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A,
+          __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_pd (double __A)
+{
+  return __extension__ (__m512d)(__v8df)
+    { __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_ps (float __A)
+{
+  return __extension__ (__m512)(__v16sf)
+    { __A, __A, __A, __A, __A, __A, __A, __A,
+      __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+/* Create the vector [A B C D A B C D A B C D A B C D].  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
+{
+  return __extension__ (__m512i)(__v16si)
+        { __D, __C, __B, __A, __D, __C, __B, __A,
+          __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_epi64 (long long __A, long long __B, long long __C,
+                  long long __D)
+{
+  return __extension__ (__m512i) (__v8di)
+        { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_pd (double __A, double __B, double __C, double __D)
+{
+  return __extension__ (__m512d)
+        { __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set4_ps (float __A, float __B, float __C, float __D)
+{
+  return __extension__ (__m512)
+        { __D, __C, __B, __A, __D, __C, __B, __A,
+          __D, __C, __B, __A, __D, __C, __B, __A };
+}
+
+#define _mm512_setr4_epi64(e0,e1,e2,e3)                                              \
+  _mm512_set4_epi64(e3,e2,e1,e0)
+
+#define _mm512_setr4_epi32(e0,e1,e2,e3)                                              \
+  _mm512_set4_epi32(e3,e2,e1,e0)
+
+#define _mm512_setr4_pd(e0,e1,e2,e3)                                         \
+  _mm512_set4_pd(e3,e2,e1,e0)
+
+#define _mm512_setr4_ps(e0,e1,e2,e3)                                         \
+  _mm512_set4_ps(e3,e2,e1,e0)
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_ps (void)
+{
+  return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+                                0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero (void)
+{
+  return _mm512_setzero_ps ();
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_pd (void)
+{
+  return __extension__ (__m512d) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_epi32 (void)
+{
+  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_si512 (void)
+{
+  return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 };
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+                                                 (__v8df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+                                                (__v16sf) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A,
+                                                (__v16sf)
+                                                _mm512_setzero_ps (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_pd (void const *__P)
+{
+  return *(__m512d *) __P;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_pd (void *__P, __m512d __A)
+{
+  *(__m512d *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeapd512_mask ((__v8df *) __P, (__v8df) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_ps (void const *__P)
+{
+  return *(__m512 *) __P;
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+                                                 (__v16sf) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_ps (__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_ps (void *__P, __m512 __A)
+{
+  *(__m512 *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeaps512_mask ((__v16sf *) __P, (__v16sf) __A,
+                                  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_epi64 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+                                                       (__v8di) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P,
+                                                       (__v8di)
+                                                       _mm512_setzero_si512 (),
+                                                       (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_epi64 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A,
+                                       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_si512 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_epi32 (void const *__P)
+{
+  return *(__m512i *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+                                                       (__v16si) __W,
+                                                       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P,
+                                                       (__v16si)
+                                                       _mm512_setzero_si512 (),
+                                                       (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_si512 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_epi32 (void *__P, __m512i __A)
+{
+  *(__m512i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A,
+                                       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A * (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mullox_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A * (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mullox_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return _mm512_mask_mov_epi64 (__W, __M, _mm512_mullox_epi64 (__A, __B));
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A + (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A - (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sllv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srav_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X,
+                                                (__v8di) __Y,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A + (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X,
+                                                 (__v16si) __Y,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A - (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_epu32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+                                                  (__v16si) __Y,
+                                                  (__v8di)
+                                                  _mm512_undefined_epi32 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+                                                  (__v16si) __Y,
+                                                  (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X,
+                                                  (__v16si) __Y,
+                                                  (__v8di)
+                                                  _mm512_setzero_si512 (),
+                                                  __M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi64 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+                       unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
+}
+#else
+#define _mm512_slli_epi64(X, C)                                                   \
+  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_slli_epi64(W, U, X, C)                                \
+  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_slli_epi64(U, X, C)                                   \
+  ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi64 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U,
+                       __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
+}
+#else
+#define _mm512_srli_epi64(X, C)                                                   \
+  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_srli_epi64(W, U, X, C)                                \
+  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_srli_epi64(U, X, C)                                   \
+  ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi64 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+                       unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
+}
+#else
+#define _mm512_srai_epi64(X, C)                                                   \
+  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_srai_epi64(W, U, X, C)                                \
+  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_srai_epi64(U, X, C)                                  \
+  ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi64 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A,
+                                                (__v2di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_slli_epi32 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                       unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+#else
+#define _mm512_slli_epi32(X, C)                                                    \
+  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_slli_epi32(W, U, X, C)                                  \
+  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_slli_epi32(U, X, C)                                    \
+  ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sll_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srli_epi32 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U,
+                       __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+#else
+#define _mm512_srli_epi32(X, C)                                                    \
+  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_srli_epi32(W, U, X, C)                                  \
+  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_srli_epi32(U, X, C)                                   \
+  ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srl_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_srai_epi32 (__m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                       unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B)
+{
+  return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+#else
+#define _mm512_srai_epi32(X, C)                                                    \
+  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_srai_epi32(W, U, X, C)                                 \
+  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_srai_epi32(U, X, C)                                   \
+  ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+#endif
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sra_epi32 (__m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B)
+{
+  return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A,
+                                                (__v4si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A,
+                                              (__v2df) __B,
+                                              __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_addss_round ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R)
+{
+  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A,
+                                              (__v2df) __B,
+                                              __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_subss_round ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R)
+{
+  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_add_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_addsd_round(A, B, C)
+
+#define _mm_mask_add_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_add_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_add_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_addss_round(A, B, C)
+
+#define _mm_mask_add_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_add_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_sub_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_subsd_round(A, B, C)
+
+#define _mm_mask_sub_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_sub_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_sub_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_subss_round(A, B, C)
+
+#define _mm_mask_sub_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_sub_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
+
+/* Constant helper to represent the ternary logic operations among
+   vector A, B and C.  */
+typedef enum
+{
+  _MM_TERNLOG_A = 0xF0,
+  _MM_TERNLOG_B = 0xCC,
+  _MM_TERNLOG_C = 0xAA
+} _MM_TERNLOG_ENUM;
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C,
+                          const int __imm)
+{
+  return (__m512i)
+    __builtin_ia32_pternlogq512_mask ((__v8di) __A,
+                                     (__v8di) __B,
+                                     (__v8di) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B,
+                               __m512i __C, const int __imm)
+{
+  return (__m512i)
+    __builtin_ia32_pternlogq512_mask ((__v8di) __A,
+                                     (__v8di) __B,
+                                     (__v8di) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
+                                __m512i __C, const int __imm)
+{
+  return (__m512i)
+    __builtin_ia32_pternlogq512_maskz ((__v8di) __A,
+                                      (__v8di) __B,
+                                      (__v8di) __C,
+                                      (unsigned char) __imm,
+                                      (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C,
+                          const int __imm)
+{
+  return (__m512i)
+    __builtin_ia32_pternlogd512_mask ((__v16si) __A,
+                                     (__v16si) __B,
+                                     (__v16si) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B,
+                               __m512i __C, const int __imm)
+{
+  return (__m512i)
+    __builtin_ia32_pternlogd512_mask ((__v16si) __A,
+                                     (__v16si) __B,
+                                     (__v16si) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+                                __m512i __C, const int __imm)
+{
+  return (__m512i)
+    __builtin_ia32_pternlogd512_maskz ((__v16si) __A,
+                                      (__v16si) __B,
+                                      (__v16si) __C,
+                                      (unsigned char) __imm,
+                                      (__mmask16) __U);
+}
+#else
+#define _mm512_ternarylogic_epi64(A, B, C, I)                  \
+  ((__m512i)                                                   \
+   __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A),   \
+                                    (__v8di) (__m512i) (B),    \
+                                    (__v8di) (__m512i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) -1))
+#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I)          \
+  ((__m512i)                                                   \
+   __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A),   \
+                                    (__v8di) (__m512i) (B),    \
+                                    (__v8di) (__m512i) (C),    \
+                                    (unsigned char)(I),        \
+                                    (__mmask8) (U)))
+#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I)         \
+  ((__m512i)                                                   \
+   __builtin_ia32_pternlogq512_maskz ((__v8di) (__m512i) (A),  \
+                                     (__v8di) (__m512i) (B),   \
+                                     (__v8di) (__m512i) (C),   \
+                                     (unsigned char) (I),      \
+                                     (__mmask8) (U)))
+#define _mm512_ternarylogic_epi32(A, B, C, I)                  \
+  ((__m512i)                                                   \
+   __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A),  \
+                                    (__v16si) (__m512i) (B),   \
+                                    (__v16si) (__m512i) (C),   \
+                                    (unsigned char) (I),       \
+                                    (__mmask16) -1))
+#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I)          \
+  ((__m512i)                                                   \
+   __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A),  \
+                                    (__v16si) (__m512i) (B),   \
+                                    (__v16si) (__m512i) (C),   \
+                                    (unsigned char) (I),       \
+                                    (__mmask16) (U)))
+#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I)         \
+  ((__m512i)                                                   \
+   __builtin_ia32_pternlogd512_maskz ((__v16si) (__m512i) (A), \
+                                     (__v16si) (__m512i) (B),  \
+                                     (__v16si) (__m512i) (C),  \
+                                     (unsigned char) (I),      \
+                                     (__mmask16) (U)))
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp14_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                                                  (__v8df)
+                                                  _mm512_undefined_pd (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp14_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                                                 (__v16sf)
+                                                 _mm512_undefined_ps (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                                                 (__v16sf) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B,
+                                          (__v2df) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
+                                               (__v2df) __A,
+                                               (__v2df) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B,
+                                               (__v2df) __A,
+                                               (__v2df) _mm_setzero_pd (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B,
+                                         (__v4sf) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
+                                               (__v4sf) __A,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B,
+                                               (__v4sf) __A,
+                                               (__v4sf) _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
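+/* Editor's note -- illustrative usage sketch, not part of the upstream GCC
+   header.  The rcp14 intrinsics above return a reciprocal approximation
+   with a relative error of at most 2^-14 and follow the usual merge-mask
+   and zero-mask conventions.  With arbitrary __m512 values src and v and a
+   caller-chosen __mmask16 m:
+
+     __m512 all    = _mm512_rcp14_ps (v);
+     __m512 merged = _mm512_mask_rcp14_ps (src, m, v);
+     __m512 zeroed = _mm512_maskz_rcp14_ps (m, v);
+
+   all approximates 1/v in every lane; merged keeps src in masked-off lanes;
+   zeroed clears them instead.  */
+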
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt14_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                                                    (__v8df)
+                                                    _mm512_undefined_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                                                    (__v8df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt14_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B,
+                                            (__v2df) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
+                                                (__v2df) __A,
+                                                (__v2df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B,
+                                                (__v2df) __A,
+                                                (__v2df) _mm_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B,
+                                           (__v4sf) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
+                                                (__v4sf) __A,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B,
+                                               (__v4sf) __A,
+                                               (__v4sf) _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
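+/* Editor's note -- illustrative usage sketch, not part of the upstream GCC
+   header.  rsqrt14 computes an approximate reciprocal square root (relative
+   error at most 2^-14).  The scalar forms operate on the low element of the
+   second operand and copy the upper element from the first, e.g. with
+   arbitrary __m128d values a and b:
+
+     __m128d r = _mm_rsqrt14_sd (a, b);
+
+   r[0] approximates 1/sqrt(b[0]) and r[1] is copied from a[1].  */
+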
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_pd (__m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                                                 (__v8df)
+                                                 _mm512_undefined_pd (),
+                                                 (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                          const int __R)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                                                 (__v8df) __W,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_ps (__m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+                                                (__v16sf)
+                                                _mm512_undefined_ps (),
+                                                (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+                                                (__v16sf) __W,
+                                                (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+                                                (__v16sf)
+                                                _mm512_setzero_ps (),
+                                                (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+                                                    (__v2df) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                       const int __R)
+{
+  return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+                                                    (__v2df) __A,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B,
+                                                    (__v2df) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+                                                   (__v4sf) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                       const int __R)
+{
+  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+                                                   (__v4sf) __A,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B,
+                                                   (__v4sf) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U, __R);
+}
+#else
+#define _mm512_sqrt_round_pd(A, C)            \
+    (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_sqrt_round_pd(W, U, A, C) \
+    (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C)
+
+#define _mm512_maskz_sqrt_round_pd(U, A, C)   \
+    (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_sqrt_round_ps(A, C)            \
+    (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_sqrt_round_ps(W, U, A, C) \
+    (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C)
+
+#define _mm512_maskz_sqrt_round_ps(U, A, C)   \
+    (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_sqrt_round_sd(A, B, C)           \
+    (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
+       (__v2df) _mm_setzero_pd (), -1, C)
+
+#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C)
+
+#define _mm_maskz_sqrt_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \
+       (__v2df) _mm_setzero_pd (), U, C)
+
+#define _mm_sqrt_round_ss(A, B, C)           \
+    (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
+       (__v4sf) _mm_setzero_ps (), -1, C)
+
+#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C)
+
+#define _mm_maskz_sqrt_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \
+       (__v4sf) _mm_setzero_ps (), U, C)
+#endif
+
+#define _mm_mask_sqrt_sd(W, U, A, B) \
+    _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_sqrt_sd(U, A, B) \
+    _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_sqrt_ss(W, U, A, B) \
+    _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_sqrt_ss(U, A, B) \
+    _mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
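+/* Editor's note -- illustrative usage sketch, not part of the upstream GCC
+   header.  The *_round_* square-root intrinsics take an explicit rounding
+   and exception control as their last argument, which must be a constant
+   expression, e.g. for an arbitrary __m512d value v:
+
+     __m512d r = _mm512_sqrt_round_pd (v,
+                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+   whereas the _mm_mask_sqrt_sd/_mm_maskz_sqrt_ss wrappers above simply pass
+   _MM_FROUND_CUR_DIRECTION, i.e. the rounding mode currently set in MXCSR.  */
+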
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+                                                   (__v16si)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi8_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+                                                   (__v8di)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi32 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+                                                   (__v16si)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+                                                   (__v8di)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi64 (__m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+                                                   (__v8di)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+                                                   (__v16si)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu8_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+                                                   (__v8di)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi32 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+                                                   (__v16si)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+                                                   (__v8di)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_epi64 (__m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+                                                   (__v8di)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X)
+{
+  return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U);
+}
+
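+/* Editor's note -- illustrative usage sketch, not part of the upstream GCC
+   header.  The conversion intrinsics above widen packed integers, with the
+   cvtepi forms sign-extending and the cvtepu forms zero-extending each
+   element.  For example, with an arbitrary __m128i value x holding 16
+   signed bytes and an __m256i value y holding 8 unsigned doublewords:
+
+     __m512i dwords = _mm512_cvtepi8_epi32 (x);
+     __m512i qwords = _mm512_cvtepu32_epi64 (y);
+
+   dwords sign-extends each byte to 32 bits; qwords zero-extends each
+   doubleword to 64 bits.  */
+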
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                         __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                          const int __R)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                         __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                         __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                          const int __R)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                         __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U, __R);
+}
+#else
+#define _mm512_add_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_add_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_add_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_add_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_add_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_add_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_sub_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_sub_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_sub_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_sub_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_sub_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_sub_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+#endif
+
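+/* Editor's note -- illustrative usage sketch, not part of the upstream GCC
+   header.  With arbitrary __m512d values a and b and a caller-chosen
+   __mmask8 m, the rounding-controlled arithmetic above can be used as:
+
+     __m512d sum  = _mm512_add_round_pd (a, b,
+                      _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+     __m512d diff = _mm512_maskz_sub_round_pd (m, a, b,
+                      _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+
+   diff keeps the truncated a[i] - b[i] where bit i of m is set and is zero
+   elsewhere.  */
+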
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                         __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                          const int __R)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                         __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+                                                (__v8df) __V,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M,
+                         __m512d __V, const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+                                                (__v8df) __V,
+                                                (__v8df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V,
+                          const int __R)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+                                                (__v8df) __V,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                         __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A,
+                                              (__v2df) __B,
+                                              __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R)
+{
+  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A,
+                                              (__v2df) __B,
+                                              __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_divss_round ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R)
+{
+  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U, __R);
+}
+
+#else
+#define _mm512_mul_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_mul_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_mul_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_mul_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_mul_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_mul_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm512_div_round_pd(A, B, C)            \
+    (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C)
+
+#define _mm512_mask_div_round_pd(W, U, A, B, C) \
+    (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_div_round_pd(U, A, B, C)   \
+    (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C)
+
+#define _mm512_div_round_ps(A, B, C)            \
+    (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C)
+
+#define _mm512_mask_div_round_ps(W, U, A, B, C) \
+    (__m512)__builtin_ia32_divps512_mask(A, B, W, U, C)
+
+#define _mm512_maskz_div_round_ps(U, A, B, C)   \
+    (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C)
+
+#define _mm_mul_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_mulsd_round(A, B, C)
+
+#define _mm_mask_mul_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_mul_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_mul_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_mulss_round(A, B, C)
+
+#define _mm_mask_mul_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_mul_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_div_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_divsd_round(A, B, C)
+
+#define _mm_mask_div_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_div_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_div_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_divss_round(A, B, C)
+
+#define _mm_mask_div_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_div_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
+
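+/* Editor's note -- illustrative usage sketch, not part of the upstream GCC
+   header.  The scalar multiply/divide round intrinsics above operate on the
+   low element only and copy the remaining elements from the first operand,
+   e.g. with arbitrary __m128d values a and b:
+
+     __m128d q = _mm_div_round_sd (a, b,
+                   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+   q[0] is a[0] / b[0] rounded to nearest without raising exceptions and
+   q[1] is copied from a[1].  */
+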
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                         __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                          const int __R)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                         __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                         __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                          const int __R)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                         __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U, __R);
+}
+#else
+#define _mm512_max_round_pd(A, B,  R) \
+    (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
+
+#define _mm512_mask_max_round_pd(W, U,  A, B, R) \
+    (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_max_round_pd(U, A,  B, R) \
+    (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
+
+#define _mm512_max_round_ps(A, B,  R) \
+    (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R)
+
+#define _mm512_mask_max_round_ps(W, U,  A, B, R) \
+    (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_max_round_ps(U, A,  B, R) \
+    (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
+
+#define _mm512_min_round_pd(A, B,  R) \
+    (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R)
+
+#define _mm512_mask_min_round_pd(W, U,  A, B, R) \
+    (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_min_round_pd(U, A,  B, R) \
+    (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R)
+
+#define _mm512_min_round_ps(A, B, R) \
+    (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R)
+
+#define _mm512_mask_min_round_ps(W, U,  A, B, R) \
+    (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R)
+
+#define _mm512_maskz_min_round_ps(U, A,  B, R) \
+    (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R)
+#endif
+
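For the 512-bit max/min rounding variants, the last argument is a suppress-all-exceptions control rather than a full rounding mode. A small sketch of the plain and zero-masked forms (assumes -mavx512f; mask and values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m512d a = _mm512_set1_pd(1.5);
  __m512d b = _mm512_set1_pd(-2.0);

  /* Elementwise maximum without raising floating-point exceptions. */
  __m512d m = _mm512_max_round_pd(a, b, _MM_FROUND_NO_EXC);

  /* Zero-masked variant: lanes with a 0 mask bit become 0.0. */
  __m512d z = _mm512_maskz_max_round_pd((__mmask8)0x0F, a, b, _MM_FROUND_NO_EXC);

  double out[8];
  _mm512_storeu_pd(out, m);
  printf("max lane0 = %g\n", out[0]);
  _mm512_storeu_pd(out, z);
  printf("maskz lane7 = %g\n", out[7]);
  return 0;
}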
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                            __m512d __B, const int __R)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                             const int __R)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                            __m512 __B, const int __R)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                             const int __R)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df) __W,
+                                                      (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                        const int __R)
+{
+  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf) __W,
+                                                     (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     (__mmask8) __U, __R);
+}
+#else
+#define _mm512_scalef_round_pd(A, B, C)                                        \
+  ((__m512d)                                                           \
+   __builtin_ia32_scalefpd512_mask((A), (B),                           \
+                                  (__v8df) _mm512_undefined_pd(),      \
+                                  -1, (C)))
+
+#define _mm512_mask_scalef_round_pd(W, U, A, B, C)                     \
+  ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_pd(U, A, B, C)                       \
+  ((__m512d)                                                           \
+   __builtin_ia32_scalefpd512_mask((A), (B),                           \
+                                  (__v8df) _mm512_setzero_pd(),        \
+                                  (U), (C)))
+
+#define _mm512_scalef_round_ps(A, B, C)                                        \
+  ((__m512)                                                            \
+   __builtin_ia32_scalefps512_mask((A), (B),                           \
+                                  (__v16sf) _mm512_undefined_ps(),     \
+                                  -1, (C)))
+
+#define _mm512_mask_scalef_round_ps(W, U, A, B, C)                     \
+  ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C)))
+
+#define _mm512_maskz_scalef_round_ps(U, A, B, C)                       \
+  ((__m512)                                                            \
+   __builtin_ia32_scalefps512_mask((A), (B),                           \
+                                  (__v16sf) _mm512_setzero_ps(),       \
+                                  (U), (C)))
+
+#define _mm_scalef_round_sd(A, B, C)                                   \
+  ((__m128d)                                                           \
+   __builtin_ia32_scalefsd_mask_round ((A), (B),                       \
+                                      (__v2df) _mm_undefined_pd (),    \
+                                      -1, (C)))
+
+#define _mm_scalef_round_ss(A, B, C)                                   \
+  ((__m128)                                                            \
+   __builtin_ia32_scalefss_mask_round ((A), (B),                       \
+                                      (__v4sf) _mm_undefined_ps (),    \
+                                      -1, (C)))
+
+#define _mm_mask_scalef_round_sd(W, U, A, B, C)                                \
+  ((__m128d)                                                           \
+   __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C)))
+
+#define _mm_mask_scalef_round_ss(W, U, A, B, C)                                \
+  ((__m128)                                                            \
+   __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C)))
+
+#define _mm_maskz_scalef_round_sd(U, A, B, C)                          \
+  ((__m128d)                                                           \
+   __builtin_ia32_scalefsd_mask_round ((A), (B),                       \
+                                      (__v2df) _mm_setzero_pd (),      \
+                                      (U), (C)))
+
+#define _mm_maskz_scalef_round_ss(U, A, B, C)                          \
+  ((__m128)                                                            \
+   __builtin_ia32_scalefss_mask_round ((A), (B),                       \
+                                      (__v4sf) _mm_setzero_ps (),      \
+                                      (U), (C)))
+#endif
+
+#define _mm_mask_scalef_sd(W, U, A, B) \
+    _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_scalef_sd(U, A, B) \
+    _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_scalef_ss(W, U, A, B) \
+    _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_scalef_ss(U, A, B) \
+    _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
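The scalef family computes a * 2^floor(b) per lane; a minimal sketch of the rounding-mode form (assumes -mavx512f; values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m512d a = _mm512_set1_pd(3.0);
  __m512d e = _mm512_set1_pd(4.0);   /* exponent: scale by 2^4 */

  /* Each lane becomes 3.0 * 2^4 = 48.0, rounded to nearest, no exceptions. */
  __m512d r = _mm512_scalef_round_pd(a, e,
                                     _MM_FROUND_TO_NEAREST_INT
                                     | _MM_FROUND_NO_EXC);
  double out[8];
  _mm512_storeu_pd(out, r);
  printf("%g\n", out[0]);
  return 0;
}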
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                           __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+                            __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                            __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                           __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+                            __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                            __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                           __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+                            __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                            __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                           __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+                            __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                            __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                              __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+                               __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                               __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                              __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+                               __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                               __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      -(__v8df) __C,
+                                                      (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                              __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      -(__v8df) __C,
+                                                      (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+                               __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                               __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     -(__v16sf) __C,
+                                                     (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                              __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     -(__v16sf) __C,
+                                                     (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+                               __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                               __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                            __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C,
+                             __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                             __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                            __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C,
+                             __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                             __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                            __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C,
+                             __mmask8 __U, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                             __m512d __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                            __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C,
+                             __mmask16 __U, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                             __m512 __C, const int __R)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U, __R);
+}
+#else
+#define _mm512_fmadd_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmadd_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmadd_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmadd_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsub_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmsub_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsub_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmsub_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmsubps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmaddsub_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fmaddsub_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R)
+
+#define _mm512_fmsubadd_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fmsubadd_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R)
+
+#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R)
+
+#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R)
+
+#define _mm512_fnmadd_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfnmaddpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfnmaddpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fnmadd_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfnmaddps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfnmaddps512_maskz(A, B, C, U, R)
+
+#define _mm512_fnmsub_round_pd(A, B, C, R)            \
+    (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R)    \
+    (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R)   \
+    (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R)   \
+    (__m512d)__builtin_ia32_vfnmsubpd512_maskz(A, B, C, U, R)
+
+#define _mm512_fnmsub_round_ps(A, B, C, R)            \
+    (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, -1, R)
+
+#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R)    \
+    (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R)
+
+#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R)   \
+    (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R)
+
+#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R)   \
+    (__m512)__builtin_ia32_vfnmsubps512_maskz(A, B, C, U, R)
+#endif
+
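A sketch of the merge-masked fused multiply-add with an explicit rounding mode, as defined above (assumes -mavx512f; mask and values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m512d a = _mm512_set1_pd(2.0);
  __m512d b = _mm512_set1_pd(3.0);
  __m512d c = _mm512_set1_pd(1.0);

  /* Lanes selected by the mask get a*b + c = 7.0; the rest keep a (2.0). */
  __m512d r = _mm512_mask_fmadd_round_pd(a, (__mmask8)0xAA, b, c,
                                         _MM_FROUND_TO_NEAREST_INT
                                         | _MM_FROUND_NO_EXC);
  double out[8];
  _mm512_storeu_pd(out, r);
  printf("lane0 = %g, lane1 = %g\n", out[0], out[1]);
  return 0;
}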
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
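A small sketch of the integer absolute-value forms just above (assumes -mavx512f; mask and values are illustrative):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  __m512i v = _mm512_set1_epi32(-7);

  /* All 16 lanes become |-7| = 7. */
  __m512i a = _mm512_abs_epi32(v);

  /* Zero-masked form: lanes with a 0 mask bit are zeroed instead. */
  __m512i z = _mm512_maskz_abs_epi32((__mmask16)0x00FF, v);

  int out[16];
  _mm512_storeu_si512((void *)out, a);
  printf("abs lane0 = %d\n", out[0]);
  _mm512_storeu_si512((void *)out, z);
  printf("maskz lane15 = %d\n", out[15]);
  return 0;
}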
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastss_ps (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+                                                (__v16sf)
+                                                _mm512_undefined_ps (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+                                                (__v16sf) __O, __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A,
+                                                (__v16sf)
+                                                _mm512_setzero_ps (),
+                                                __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastsd_pd (__m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+                                                 (__v8df)
+                                                 _mm512_undefined_pd (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+                                                 (__v8df) __O, __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastd_epi32 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+                                                 (__v16si) __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi32 (int __A)
+{
+  return (__m512i)(__v16si)
+    { __A, __A, __A, __A, __A, __A, __A, __A,
+      __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
+                                                          __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi32 (__mmask16 __M, int __A)
+{
+  return (__m512i)
+        __builtin_ia32_pbroadcastd512_gpr_mask (__A,
+                                                (__v16si) _mm512_setzero_si512 (),
+                                                __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcastq_epi64 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+                                                 (__v8di) __O, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_epi64 (long long __A)
+{
+  return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A };
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
+{
+  return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
+                                                          __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+  return (__m512i)
+        __builtin_ia32_pbroadcastq512_gpr_mask (__A,
+                                                (__v8di) _mm512_setzero_si512 (),
+                                                __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f32x4 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                                                    (__v16sf)
+                                                    _mm512_undefined_ps (),
+                                                    (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                                                    (__v16sf) __O,
+                                                    __M);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A)
+{
+  return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A,
+                                                    (__v16sf)
+                                                    _mm512_setzero_ps (),
+                                                    __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i32x4 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                                                     (__v16si)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                                                     (__v16si) __O,
+                                                     __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_f64x4 (__m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                                                     (__v8df)
+                                                     _mm512_undefined_pd (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                                                     (__v8df) __O,
+                                                     __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A)
+{
+  return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A,
+                                                     (__v8df)
+                                                     _mm512_setzero_pd (),
+                                                     __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_broadcast_i64x4 (__m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                                                     (__v8di)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                                                     (__v8di) __O,
+                                                     __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A)
+{
+  return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     __M);
+}
+
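The broadcast_{f,i}32x4 and broadcast_{f,i}64x4 families above replicate a 128-bit or 256-bit source across the full 512-bit register; the _mask_ variants merge into __O/__W under a write mask, while the _maskz_ variants zero the unselected elements. A minimal illustrative sketch (not part of the header), assuming AVX-512F and compilation with -mavx512f:

    #include <immintrin.h>

    /* Replicate a 4-float pattern across all 16 lanes of a zmm register,
       zeroing the lanes not selected by `keep` (maskz variant).  */
    static inline __m512 broadcast_selected(__m128 pattern, __mmask16 keep)
    {
        return _mm512_maskz_broadcast_f32x4(keep, pattern);
    }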
+typedef enum
+{
+  _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
+  _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
+  _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
+  _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
+  _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
+  _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
+  _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
+  _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
+  _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
+  _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
+  _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
+  _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
+  _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
+  _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
+  _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
+  _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
+  _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
+  _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
+  _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
+  _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
+  _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
+  _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
+  _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
+  _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
+  _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
+  _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
+  _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
+  _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
+  _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
+  _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
+  _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
+  _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
+  _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
+  _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
+  _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
+  _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
+  _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
+  _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
+  _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
+  _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
+  _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
+  _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
+  _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
+  _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
+  _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
+  _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
+  _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
+  _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
+  _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
+  _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
+  _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
+  _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
+  _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
+  _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
+  _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
+  _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
+  _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
+  _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
+  _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
+  _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
+  _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
+  _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
+  _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
+  _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
+  _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
+  _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
+  _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
+  _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
+  _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
+  _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
+  _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
+  _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
+  _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
+  _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
+  _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
+  _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
+  _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
+  _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
+  _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
+  _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
+  _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
+  _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
+  _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
+  _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
+  _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
+  _MM_PERM_DDDD = 0xFF
+} _MM_PERM_ENUM;
+
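Each letter of an _MM_PERM_* name encodes one 2-bit source selector (A=0, B=1, C=2, D=3), read from the most significant bit pair to the least, so _MM_PERM_DCBA (0xE4) is the identity permutation and _MM_PERM_AAAA (0x00) broadcasts element 0. A short sketch of its use with _mm512_shuffle_epi32, which applies the pattern independently within each 128-bit lane:

    #include <immintrin.h>

    /* Reverse the four 32-bit elements inside every 128-bit lane.
       _MM_PERM_ABCD (0x1B) selects elements 0,1,2,3 from high to low,
       i.e. the reverse of the identity pattern _MM_PERM_DCBA.  */
    static inline __m512i reverse_dwords_in_lanes(__m512i v)
    {
        return _mm512_shuffle_epi32(v, _MM_PERM_ABCD);
    }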
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+                                                 __mask,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                          _MM_PERM_ENUM __mask)
+{
+  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+                                                 __mask,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask)
+{
+  return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A,
+                                                 __mask,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+                                                  (__v8di) __B, __imm,
+                                                  (__v8di)
+                                                  _mm512_undefined_epi32 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A,
+                          __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+                                                  (__v8di) __B, __imm,
+                                                  (__v8di) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B,
+                           const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A,
+                                                  (__v8di) __B, __imm,
+                                                  (__v8di)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+                                                  (__v16si) __B,
+                                                  __imm,
+                                                  (__v16si)
+                                                  _mm512_undefined_epi32 (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A,
+                          __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+                                                  (__v16si) __B,
+                                                  __imm,
+                                                  (__v16si) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B,
+                           const int __imm)
+{
+  return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A,
+                                                  (__v16si) __B,
+                                                  __imm,
+                                                  (__v16si)
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+                                                  (__v8df) __B, __imm,
+                                                  (__v8df)
+                                                  _mm512_undefined_pd (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A,
+                          __m512d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+                                                  (__v8df) __B, __imm,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B,
+                           const int __imm)
+{
+  return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A,
+                                                  (__v8df) __B, __imm,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __imm,
+                                                 (__v16sf)
+                                                 _mm512_undefined_ps (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A,
+                          __m512 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __imm,
+                                                 (__v16sf) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B,
+                           const int __imm)
+{
+  return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A,
+                                                 (__v16sf) __B, __imm,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __U);
+}
+
+#else
+#define _mm512_shuffle_epi32(X, C)                                      \
+  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_epi32(W, U, X, C)                           \
+  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_epi32(U, X, C)                             \
+  ((__m512i)  __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+
+#define _mm512_shuffle_i64x2(X, Y, C)                                   \
+  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
+      (__v8di)(__m512i)(Y), (int)(C),\
+    (__v8di)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C)                        \
+  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
+      (__v8di)(__m512i)(Y), (int)(C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_i64x2(U, X, Y, C)                          \
+  ((__m512i)  __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X),     \
+      (__v8di)(__m512i)(Y), (int)(C),\
+    (__v8di)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask8)(U)))
+
+#define _mm512_shuffle_i32x4(X, Y, C)                                   \
+  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
+      (__v16si)(__m512i)(Y), (int)(C),\
+    (__v16si)(__m512i)_mm512_undefined_epi32 (),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C)                        \
+  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
+      (__v16si)(__m512i)(Y), (int)(C),\
+    (__v16si)(__m512i)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_i32x4(U, X, Y, C)                          \
+  ((__m512i)  __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X),    \
+      (__v16si)(__m512i)(Y), (int)(C),\
+    (__v16si)(__m512i)_mm512_setzero_si512 (),\
+    (__mmask16)(U)))
+
+#define _mm512_shuffle_f64x2(X, Y, C)                                   \
+  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),     \
+      (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_undefined_pd(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C)                        \
+  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),     \
+      (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_f64x2(U, X, Y, C)                         \
+  ((__m512d)  __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X),    \
+      (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_setzero_pd(),\
+    (__mmask8)(U)))
+
+#define _mm512_shuffle_f32x4(X, Y, C)                                  \
+  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
+      (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_undefined_ps(),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C)                       \
+  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
+      (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_f32x4(U, X, Y, C)                         \
+  ((__m512)  __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X),     \
+      (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_setzero_ps(),\
+    (__mmask16)(U)))
+#endif
+
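The shuffle_{i,f}{32x4,64x2} intrinsics above pick whole 128-bit lanes: for the 512-bit forms the two low result lanes come from __A and the two high lanes from __B, with each 2-bit field of the immediate selecting one of the four source lanes. The immediate must be a compile-time constant, which is why the non-optimizing build falls back to macros. A small illustrative sketch:

    #include <immintrin.h>

    /* Interleave the low halves of a and b:
       result lanes = { a.lane0, a.lane1, b.lane0, b.lane1 }.
       0x44 encodes the lane selectors 0,1 for a and 0,1 for b.  */
    static inline __m512i take_low_halves(__m512i a, __m512i b)
    {
        return _mm512_shuffle_i32x4(a, b, 0x44);
    }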
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rolv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rorv_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rolv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rorv_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
+}
+
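The rolv/rorv intrinsics rotate each element left or right by a per-element count taken from __B, with the count reduced modulo the element width; a uniform rotation is just the special case of a splatted count. For illustration:

    #include <immintrin.h>

    /* Rotate every 32-bit element left by 8 bits.  */
    static inline __m512i rotl8_epi32(__m512i v)
    {
        return _mm512_rolv_epi32(v, _mm512_set1_epi32(8));
    }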
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_undefined_si256 (),
+                                                    (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
+                               const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                                                     (__v8si)
+                                                     _mm256_undefined_si256 (),
+                                                     (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
+                               const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                                                     (__v8si) __W,
+                                                     (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                                                     (__v8si)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U, __R);
+}
+#else
+#define _mm512_cvtt_roundpd_epi32(A, B)                     \
+    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+
+#define _mm512_cvtt_roundpd_epu32(A, B)                     \
+    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+#endif
+
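All of the *_round* conversions take an extra __R operand built from the _MM_FROUND_* constants pulled in through <immintrin.h>; the truncating cvtt forms always truncate and effectively only honour the suppress-all-exceptions flag, while the cvt forms also accept an explicit rounding direction. Because __R must be a compile-time constant, the non-optimizing build again uses macros. A minimal sketch using the truncating form defined above:

    #include <immintrin.h>

    /* Truncate 8 doubles to 32-bit ints, suppressing FP exceptions (SAE).  */
    static inline __m256i pd_to_epi32_trunc(__m512d v)
    {
        return _mm512_cvtt_roundpd_epi32(v, _MM_FROUND_NO_EXC);
    }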
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                                                   (__v8si)
+                                                   _mm256_undefined_si256 (),
+                                                   (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A,
+                              const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_undefined_si256 (),
+                                                    (__mmask8) -1, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A,
+                              const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U, __R);
+}
+#else
+#define _mm512_cvt_roundpd_epi32(A, B)             \
+    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundpd_epi32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+
+#define _mm512_cvt_roundpd_epu32(A, B)             \
+    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B))
+
+#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B)   \
+    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundpd_epu32(U, A, B)     \
+    ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                                                     (__v16si)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
+                               const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                                                     (__v16si) __W,
+                                                     (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U, __R);
+}
+#else
+#define _mm512_cvtt_roundps_epi32(A, B)                     \
+    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundps_epi32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+
+#define _mm512_cvtt_roundps_epu32(A, B)                     \
+    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvtt_roundps_epu32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epi32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                                                   (__v16si)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A,
+                              const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_epu32 (__m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask16) -1, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A,
+                              const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U, __R);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U, __R);
+}
+#else
+#define _mm512_cvt_roundps_epi32(A, B)             \
+    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvt_roundps_epi32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundps_epi32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+
+#define _mm512_cvt_roundps_epu32(A, B)             \
+    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B))
+
+#define _mm512_mask_cvt_roundps_epu32(W, U, A, B)   \
+    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B))
+
+#define _mm512_maskz_cvt_roundps_epu32(U, A, B)     \
+    ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B))
+#endif
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_sd (__m128d __A, unsigned __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_sd(A, B, C)   \
+    (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C)
+
+#define _mm_cvt_roundi64_sd(A, B, C)   \
+    (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+
+#define _mm_cvt_roundsi64_sd(A, B, C)   \
+    (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C)
+#endif
+
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu32_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtusi2ss32(A, B, C)
+
+#define _mm_cvt_roundi32_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+
+#define _mm_cvt_roundsi32_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss32(A, B, C)
+#endif
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R);
+}
+#else
+#define _mm_cvt_roundu64_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtusi2ss64(A, B, C)
+
+#define _mm_cvt_roundi64_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+
+#define _mm_cvt_roundsi64_ss(A, B, C)   \
+    (__m128)__builtin_ia32_cvtsi2ss64(A, B, C)
+#endif
+
+#endif
+
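The scalar _mm_cvt_round*_ss/_sd helpers convert a single integer into the low element of an XMM register while copying the upper elements from __A; the 64-bit input forms are guarded by __x86_64__ and are only available on x86-64. A brief sketch, assuming an x86-64 target with AVX-512F:

    #include <immintrin.h>

    /* Place (float)x into the low lane of acc with round-to-nearest,
       keeping acc's upper three lanes unchanged.  */
    static inline __m128 set_low_from_u64(__m128 acc, unsigned long long x)
    {
        return _mm_cvt_roundu64_ss(acc, x,
            _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    }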
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+                                                 (__v16qi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+                                                  (__v16qi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi32_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                                                   (__v16qi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
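Three flavours of narrowing are provided for each width: _mm512_cvtepi32_epi8 truncates modularly, _mm512_cvtsepi32_epi8 saturates as signed, and _mm512_cvtusepi32_epi8 saturates as unsigned; the *_storeu_* forms write the narrowed, masked result directly to (possibly unaligned) memory. A small sketch of the difference:

    #include <immintrin.h>

    /* Narrow 16 ints to 16 bytes.  For an input element of 300 the
       truncating form yields 300 & 0xFF == 44, while the signed-saturating
       form clamps to 127.  */
    static inline void narrow_demo(__m512i v, __m128i *trunc_out, __m128i *sat_out)
    {
        *trunc_out = _mm512_cvtepi32_epi8(v);   /* modular truncation */
        *sat_out   = _mm512_cvtsepi32_epi8(v);  /* signed saturation  */
    }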
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+                                                 (__v16hi)
+                                                 _mm256_undefined_si256 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+                                                 (__v16hi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+                                                  (__v16hi)
+                                                  _mm256_undefined_si256 (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+                                                  (__v16hi) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A,
+                                                  (__v16hi)
+                                                  _mm256_setzero_si256 (),
+                                                  __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi32_epi16 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                                                   (__v16hi)
+                                                   _mm256_undefined_si256 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                                                   (__v16hi) __O,
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+                                                 (__v8si)
+                                                 _mm256_undefined_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+                                                 (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+                                                  (__v8si)
+                                                  _mm256_undefined_si256 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+                                                  (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A,
+                                                  (__v8si)
+                                                  _mm256_setzero_si256 (),
+                                                  __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi32 (__m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                                                   (__v8si)
+                                                   _mm256_undefined_si256 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                                                   (__v8si) __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
+{
+  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+                                                 (__v8hi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+                                                 (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+                                                  (__v8hi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+                                                  (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi16 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                                                   (__v8hi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                                                   (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+                                                 (__v16qi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovqb512mem_mask ((unsigned long long *) __P,
+                                   (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+                                                  (__v16qi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovsqb512mem_mask ((unsigned long long *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtusepi64_epi8 (__m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                                                   (__v16qi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A)
+{
+  __builtin_ia32_pmovusqb512mem_mask ((unsigned long long *) __P, (__v8di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
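/* Usage sketch (illustrative, not part of the upstream GCC header): the
   saturating down-conversion intrinsics above narrow eight signed or
   unsigned 64-bit lanes to 32-, 16- or 8-bit lanes.  Assumes a build with
   -mavx512f; the function name is hypothetical. */
#include <immintrin.h>

static inline void
narrow_s64_to_s16 (const long long in[8], short out[8])
{
  __m512i v = _mm512_loadu_si512 (in);      /* eight signed 64-bit lanes   */
  __m128i w = _mm512_cvtsepi64_epi16 (v);   /* signed saturation to 16-bit */
  _mm_storeu_si128 ((__m128i *) out, w);    /* store the eight results     */
}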
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_pd (__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_pd (__m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+                                                    (__v8df)
+                                                    _mm512_undefined_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+                                                    (__v8df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi32_ps (__m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A,
+                              const int __R)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ps (__m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A,
+                              const int __R)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U, __R);
+}
+
+#else
+#define _mm512_cvt_roundepi32_ps(A, B)        \
+    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B)   \
+    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B)
+
+#define _mm512_maskz_cvt_roundepi32_ps(U, A, B)      \
+    (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+
+#define _mm512_cvt_roundepu32_ps(A, B)        \
+    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B)   \
+    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B)
+
+#define _mm512_maskz_cvt_roundepu32_ps(U, A, B)      \
+    (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+#endif
+
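/* Usage sketch (illustrative, not part of the upstream GCC header): the
   _mm512_cvt_round* forms above take an explicit rounding-control operand,
   which must be a compile-time constant.  Assumes -mavx512f. */
#include <immintrin.h>

static inline __m512
int32_to_float_truncating (__m512i v)
{
  /* Round toward zero and suppress floating-point exceptions. */
  return _mm512_cvt_roundepi32_ps (v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}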
+#ifdef __OPTIMIZE__
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf64x4_pd (__m512d __A, const int __imm)
+{
+  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+                                                    __imm,
+                                                    (__v4df)
+                                                    _mm256_undefined_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A,
+                            const int __imm)
+{
+  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+                                                    __imm,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm)
+{
+  return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A,
+                                                    __imm,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extractf32x4_ps (__m512 __A, const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+                                                   __imm,
+                                                   (__v4sf)
+                                                   _mm_undefined_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A,
+                            const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+                                                   __imm,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A,
+                                                   __imm,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti64x4_epi64 (__m512i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+                                                    __imm,
+                                                    (__v4di)
+                                                    _mm256_undefined_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A,
+                               const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+                                                    __imm,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A,
+                                                    __imm,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_extracti32x4_epi32 (__m512i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+                                                    __imm,
+                                                    (__v4si)
+                                                    _mm_undefined_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A,
+                               const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+                                                    __imm,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A,
+                                                    __imm,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+#else
+
+#define _mm512_extractf64x4_pd(X, C)                                    \
+  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
+    (int) (C),\
+    (__v4df)(__m256d)_mm256_undefined_pd(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extractf64x4_pd(W, U, X, C)                         \
+  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
+    (int) (C),\
+    (__v4df)(__m256d)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extractf64x4_pd(U, X, C)                           \
+  ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X),   \
+    (int) (C),\
+    (__v4df)(__m256d)_mm256_setzero_pd(),\
+    (__mmask8)(U)))
+
+#define _mm512_extractf32x4_ps(X, C)                                    \
+  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
+    (int) (C),\
+    (__v4sf)(__m128)_mm_undefined_ps(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extractf32x4_ps(W, U, X, C)                         \
+  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
+    (int) (C),\
+    (__v4sf)(__m128)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extractf32x4_ps(U, X, C)                           \
+  ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X),    \
+    (int) (C),\
+    (__v4sf)(__m128)_mm_setzero_ps(),\
+    (__mmask8)(U)))
+
+#define _mm512_extracti64x4_epi64(X, C)                                 \
+  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
+    (int) (C),\
+    (__v4di)(__m256i)_mm256_undefined_si256 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extracti64x4_epi64(W, U, X, C)                      \
+  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
+    (int) (C),\
+    (__v4di)(__m256i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extracti64x4_epi64(U, X, C)                        \
+  ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X),   \
+    (int) (C),\
+    (__v4di)(__m256i)_mm256_setzero_si256 (),\
+    (__mmask8)(U)))
+
+#define _mm512_extracti32x4_epi32(X, C)                                 \
+  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
+    (int) (C),\
+    (__v4si)(__m128i)_mm_undefined_si128 (),\
+    (__mmask8)-1))
+
+#define _mm512_mask_extracti32x4_epi32(W, U, X, C)                      \
+  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
+    (int) (C),\
+    (__v4si)(__m128i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_extracti32x4_epi32(U, X, C)                        \
+  ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X),  \
+    (int) (C),\
+    (__v4si)(__m128i)_mm_setzero_si128 (),\
+    (__mmask8)(U)))
+#endif
+
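/* Usage sketch (illustrative, not part of the upstream GCC header): the
   extract intrinsics above require the lane selector to be an integer
   constant expression.  Assumes -mavx512f. */
#include <immintrin.h>

static inline __m256d
upper_four_doubles (__m512d v)
{
  /* Selector 1 is the upper 256-bit lane; 0 would be the lower lane. */
  return _mm512_extractf64x4_pd (v, 1);
}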
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A,
+                                                   (__v4si) __B,
+                                                   __imm,
+                                                   (__v16si) __A, -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A,
+                                                  (__v4sf) __B,
+                                                  __imm,
+                                                  (__v16sf) __A, -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+                                                   (__v4di) __B,
+                                                   __imm,
+                                                   (__v8di)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A,
+                        __m256i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+                                                   (__v4di) __B,
+                                                   __imm,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B,
+                         const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A,
+                                                   (__v4di) __B,
+                                                   __imm,
+                                                   (__v8di)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+                                                   (__v4df) __B,
+                                                   __imm,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A,
+                        __m256d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+                                                   (__v4df) __B,
+                                                   __imm,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B,
+                         const int __imm)
+{
+  return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A,
+                                                   (__v4df) __B,
+                                                   __imm,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+#else
+#define _mm512_insertf32x4(X, Y, C)                                     \
+  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
+    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1)))
+
+#define _mm512_inserti32x4(X, Y, C)                                     \
+  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
+    (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1)))
+
+#define _mm512_insertf64x4(X, Y, C)                                     \
+  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
+    (__v4df)(__m256d) (Y), (int) (C),                                  \
+    (__v8df)(__m512d)_mm512_undefined_pd(),                            \
+    (__mmask8)-1))
+
+#define _mm512_mask_insertf64x4(W, U, X, Y, C)                          \
+  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
+    (__v4df)(__m256d) (Y), (int) (C),                                  \
+    (__v8df)(__m512d)(W),                                              \
+    (__mmask8)(U)))
+
+#define _mm512_maskz_insertf64x4(U, X, Y, C)                            \
+  ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X),    \
+    (__v4df)(__m256d) (Y), (int) (C),                                  \
+    (__v8df)(__m512d)_mm512_setzero_pd(),                              \
+    (__mmask8)(U)))
+
+#define _mm512_inserti64x4(X, Y, C)                                     \
+  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
+    (__v4di)(__m256i) (Y), (int) (C),                                  \
+    (__v8di)(__m512i)_mm512_undefined_epi32 (),                                \
+    (__mmask8)-1))
+
+#define _mm512_mask_inserti64x4(W, U, X, Y, C)                          \
+  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
+    (__v4di)(__m256i) (Y), (int) (C),\
+    (__v8di)(__m512i)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_inserti64x4(U, X, Y, C)                            \
+  ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X),    \
+    (__v4di)(__m256i) (Y), (int) (C),                                  \
+    (__v8di)(__m512i)_mm512_setzero_si512 (),                          \
+    (__mmask8)(U)))
+#endif
+
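/* Usage sketch (illustrative, not part of the upstream GCC header): the
   insert intrinsics mirror the extract forms, again with a constant lane
   selector.  Assumes -mavx512f; concat_pd is a hypothetical helper. */
#include <immintrin.h>

static inline __m512d
concat_pd (__m256d lo, __m256d hi)
{
  __m512d v = _mm512_castpd256_pd512 (lo);  /* lo in the lower lane        */
  return _mm512_insertf64x4 (v, hi, 1);     /* hi placed in the upper lane */
}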
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_pd (void const *__P)
+{
+  return *(__m512d_u *)__P;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_pd (void *__P, __m512d __A)
+{
+  *(__m512d_u *)__P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_ps (void const *__P)
+{
+  return *(__m512_u *)__P;
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+                                                 (__v16sf) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_ps (void *__P, __m512 __A)
+{
+  *(__m512_u *)__P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A,
+                                  (__mmask16) __U);
+}
+
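/* Usage sketch (illustrative, not part of the upstream GCC header): the
   masked unaligned load/store intrinsics above make it easy to process an
   array tail without a scalar remainder loop.  Assumes -mavx512f and
   n <= 15; the names are hypothetical. */
#include <immintrin.h>

static inline void
scale_tail (float *dst, const float *src, unsigned n)
{
  __mmask16 m = (__mmask16) ((1u << n) - 1u);   /* low n lanes active      */
  __m512 v = _mm512_maskz_loadu_ps (m, src);    /* inactive lanes become 0 */
  v = _mm512_mul_ps (v, _mm512_set1_ps (2.0f));
  _mm512_mask_storeu_ps (dst, m, v);            /* only n lanes written    */
}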
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P)
+{
+  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_ss (__mmask8 __U, const float *__P)
+{
+  return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (),
+                                             __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P)
+{
+  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sd (__mmask8 __U, const double *__P)
+{
+  return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (),
+                                              __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+                                             (__v4sf) __W, __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B,
+                                             (__v4sf) _mm_setzero_ps (), __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+                                              (__v2df) __W, __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B,
+                                              (__v2df) _mm_setzero_pd (),
+                                              __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi64 (void const *__P)
+{
+  return *(__m512i_u *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+                                                    (__v8di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi64 (void *__P, __m512i __A)
+{
+  *(__m512i_u *) __P = (__m512i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_storedqudi512_mask ((long long *) __P, (__v8di) __A,
+                                    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_si512 (void const *__P)
+{
+  return *(__m512i_u *)__P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_epi32 (void const *__P)
+{
+  return *(__m512i_u *) __P;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_si512 (void *__P, __m512i __A)
+{
+  *(__m512i_u *)__P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_epi32 (void *__P, __m512i __A)
+{
+  *(__m512i_u *) __P = (__m512i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A,
+                                    (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutevar_pd (__m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+                                                       (__v8di) __C,
+                                                       (__v8df)
+                                                       _mm512_undefined_pd (),
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+                                                       (__v8di) __C,
+                                                       (__v8df) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C)
+{
+  return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A,
+                                                       (__v8di) __C,
+                                                       (__v8df)
+                                                       _mm512_setzero_pd (),
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutevar_ps (__m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                                                      (__v16si) __C,
+                                                      (__v16sf)
+                                                      _mm512_undefined_ps (),
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                                                      (__v16si) __C,
+                                                      (__v16sf) __W,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C)
+{
+  return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A,
+                                                      (__v16si) __C,
+                                                      (__v16sf)
+                                                      _mm512_setzero_ps (),
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                      /* idx */ ,
+                                                      (__v8di) __A,
+                                                      (__v8di) __B,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
+                               __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I
+                                                      /* idx */ ,
+                                                      (__v8di) __A,
+                                                      (__v8di) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
+                                __mmask8 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A,
+                                                      (__v8di) __I
+                                                      /* idx */ ,
+                                                      (__v8di) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
+                                __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8di) __A,
+                                                       (__v8di) __B,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                      /* idx */ ,
+                                                      (__v16si) __A,
+                                                      (__v16si) __B,
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
+                               __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I
+                                                      /* idx */ ,
+                                                      (__v16si) __A,
+                                                      (__v16si) __B,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
+                                __mmask16 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A,
+                                                      (__v16si) __I
+                                                      /* idx */ ,
+                                                      (__v16si) __B,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
+                                __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16si) __A,
+                                                       (__v16si) __B,
+                                                       (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I,
+                            __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
+                             __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A,
+                                                       (__v8di) __I
+                                                       /* idx */ ,
+                                                       (__v8df) __B,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
+                             __m512d __B)
+{
+  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I
+                                                        /* idx */ ,
+                                                        (__v8df) __A,
+                                                        (__v8df) __B,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                      /* idx */ ,
+                                                      (__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I
+                                                      /* idx */ ,
+                                                      (__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
+                             __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A,
+                                                      (__v16si) __I
+                                                      /* idx */ ,
+                                                      (__v16sf) __B,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
+                             __m512 __B)
+{
+  return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I
+                                                       /* idx */ ,
+                                                       (__v16sf) __A,
+                                                       (__v16sf) __B,
+                                                       (__mmask16) __U);
+}
+
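/* Usage sketch (illustrative, not part of the upstream GCC header): the
   permutex2var intrinsics above pick each destination lane from the
   concatenation of two source vectors; index values 0-7 select lanes of
   the first source and 8-15 lanes of the second.  Assumes -mavx512f. */
#include <immintrin.h>

static inline __m512i
interleave_low_epi64 (__m512i a, __m512i b)
{
  /* _mm512_set_epi64 lists lanes from 7 down to 0, so the resulting lane
     order is a0, b0, a1, b1, a2, b2, a3, b3. */
  const __m512i idx = _mm512_set_epi64 (11, 3, 10, 2, 9, 1, 8, 0);
  return _mm512_permutex2var_epi64 (a, idx, b);
}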
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_pd (__m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+                                                    (__v8df)
+                                                    _mm512_undefined_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+                                                    (__v8df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C)
+{
+  return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permute_ps (__m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C)
+{
+  return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U);
+}
+#else
+#define _mm512_permute_pd(X, C)                                                            \
+  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),     \
+                                             (__v8df)(__m512d)_mm512_undefined_pd(),\
+                                             (__mmask8)(-1)))
+
+#define _mm512_mask_permute_pd(W, U, X, C)                                         \
+  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),     \
+                                             (__v8df)(__m512d)(W),                 \
+                                             (__mmask8)(U)))
+
+#define _mm512_maskz_permute_pd(U, X, C)                                           \
+  ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C),     \
+                                             (__v8df)(__m512d)_mm512_setzero_pd(), \
+                                             (__mmask8)(U)))
+
+#define _mm512_permute_ps(X, C)                                                            \
+  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),      \
+                                             (__v16sf)(__m512)_mm512_undefined_ps(),\
+                                             (__mmask16)(-1)))
+
+#define _mm512_mask_permute_ps(W, U, X, C)                                         \
+  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),      \
+                                             (__v16sf)(__m512)(W),                 \
+                                             (__mmask16)(U)))
+
+#define _mm512_maskz_permute_ps(U, X, C)                                           \
+  ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C),      \
+                                             (__v16sf)(__m512)_mm512_setzero_ps(), \
+                                             (__mmask16)(U)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_epi64 (__m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) (-1));
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M,
+                           __m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I)
+{
+  return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex_pd (__m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+                                                 (__v8df)
+                                                 _mm512_undefined_pd (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+                                                 (__v8df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M)
+{
+  return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+#else
+#define _mm512_permutex_pd(X, M)                                               \
+  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),    \
+                                           (__v8df)(__m512d)_mm512_undefined_pd(),\
+                                           (__mmask8)-1))
+
+#define _mm512_mask_permutex_pd(W, U, X, M)                                    \
+  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),    \
+                                           (__v8df)(__m512d)(W), (__mmask8)(U)))
+
+#define _mm512_maskz_permutex_pd(U, X, M)                                      \
+  ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M),    \
+                                           (__v8df)(__m512d)_mm512_setzero_pd(),\
+                                           (__mmask8)(U)))
+
+#define _mm512_permutex_epi64(X, I)                              \
+  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+                                           (int)(I),             \
+                                           (__v8di)(__m512i)     \
+                                           (_mm512_undefined_epi32 ()),\
+                                           (__mmask8)(-1)))
+
+#define _mm512_maskz_permutex_epi64(M, X, I)                 \
+  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+                                           (int)(I),             \
+                                           (__v8di)(__m512i)     \
+                                           (_mm512_setzero_si512 ()),\
+                                           (__mmask8)(M)))
+
+#define _mm512_mask_permutex_epi64(W, M, X, I)               \
+  ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \
+                                           (int)(I),             \
+                                           (__v8di)(__m512i)(W), \
+                                           (__mmask8)(M)))
+#endif
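A minimal usage sketch (editorial illustration, not part of the imported GCC header), assuming <immintrin.h> and -mavx512f: the immediate-controlled permute defined above can reverse the four 64-bit elements within each 256-bit lane.

#include <immintrin.h>

static inline __m512i
reverse_qwords_per_lane (__m512i v)
{
  /* _MM_SHUFFLE (0, 1, 2, 3) == 0x1B picks source elements 3, 2, 1, 0.  */
  return _mm512_permutex_epi64 (v, _MM_SHUFFLE (0, 1, 2, 3));
}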
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                                                    (__v8di) __X,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                                                    (__v8di) __X,
+                                                    (__v8di)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X,
+                              __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y,
+                                                    (__v8di) __X,
+                                                    (__v8di) __W,
+                                                    __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                                                    (__v16si) __X,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                                                    (__v16si) __X,
+                                                    (__v16si)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X,
+                              __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y,
+                                                    (__v16si) __X,
+                                                    (__v16si) __W,
+                                                    __M);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_pd (__m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                                                    (__v8di) __X,
+                                                    (__v8df)
+                                                    _mm512_undefined_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                                                    (__v8di) __X,
+                                                    (__v8df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y)
+{
+  return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y,
+                                                    (__v8di) __X,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_ps (__m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                                                   (__v16si) __X,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                                                   (__v16si) __X,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y)
+{
+  return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y,
+                                                   (__v16si) __X,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U);
+}
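Another editorial sketch (not part of the header): the variable permutes above take the index vector first and the data vector second; here the full 512-bit register has its sixteen 32-bit elements reversed.

#include <immintrin.h>

static inline __m512i
reverse_dwords (__m512i v)
{
  /* _mm512_set_epi32 lists elements from highest to lowest, so this
     index vector maps destination element i to source element 15 - i.  */
  const __m512i idx = _mm512_set_epi32 (0, 1, 2, 3, 4, 5, 6, 7,
                                        8, 9, 10, 11, 12, 13, 14, 15);
  return _mm512_permutexvar_epi32 (idx, v);
}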
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+                                                (__v16sf) __V, __imm,
+                                                (__v16sf)
+                                                _mm512_undefined_ps (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M,
+                       __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+                                                (__v16sf) __V, __imm,
+                                                (__v16sf) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm)
+{
+  return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M,
+                                                (__v16sf) __V, __imm,
+                                                (__v16sf)
+                                                _mm512_setzero_ps (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+                                                 (__v8df) __V, __imm,
+                                                 (__v8df)
+                                                 _mm512_undefined_pd (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M,
+                       __m512d __V, const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+                                                 (__v8df) __V, __imm,
+                                                 (__v8df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V,
+                        const int __imm)
+{
+  return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M,
+                                                 (__v8df) __V, __imm,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C,
+                         const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8di) __C,
+                                                     __imm,
+                                                     (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                              __m512i __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                               __m512i __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8di) __C,
+                                                      __imm,
+                                                      (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C,
+                         const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16si) __C,
+                                                    __imm,
+                                                    (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                              __m512i __C, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16si) __C,
+                                                    __imm,
+                                                    (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                               __m512i __C, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16si) __C,
+                                                     __imm,
+                                                     (__mmask16) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C,
+                      const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__v2di) __C, __imm,
+                                                  (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B,
+                           __m128i __C, const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__v2di) __C, __imm,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                            __m128i __C, const int __imm, const int __R)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2di) __C,
+                                                   __imm,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C,
+                      const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__v4si) __C, __imm,
+                                                 (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B,
+                           __m128i __C, const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__v4si) __C, __imm,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                            __m128i __C, const int __imm, const int __R)
+{
+  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4si) __C, __imm,
+                                                  (__mmask8) __U, __R);
+}
+
+#else
+#define _mm512_shuffle_pd(X, Y, C)                                      \
+    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
+        (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_undefined_pd(),\
+    (__mmask8)-1))
+
+#define _mm512_mask_shuffle_pd(W, U, X, Y, C)                           \
+    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
+        (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)(W),\
+    (__mmask8)(U)))
+
+#define _mm512_maskz_shuffle_pd(U, X, Y, C)                             \
+    ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X),           \
+        (__v8df)(__m512d)(Y), (int)(C),\
+    (__v8df)(__m512d)_mm512_setzero_pd(),\
+    (__mmask8)(U)))
+
+#define _mm512_shuffle_ps(X, Y, C)                                      \
+    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
+        (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_undefined_ps(),\
+    (__mmask16)-1))
+
+#define _mm512_mask_shuffle_ps(W, U, X, Y, C)                           \
+    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
+        (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)(W),\
+    (__mmask16)(U)))
+
+#define _mm512_maskz_shuffle_ps(U, X, Y, C)                             \
+    ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X),            \
+        (__v16sf)(__m512)(Y), (int)(C),\
+    (__v16sf)(__m512)_mm512_setzero_ps(),\
+    (__mmask16)(U)))
+
+#define _mm512_fixupimm_round_pd(X, Y, Z, C, R)                                        \
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),   \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),            \
+      (__mmask8)(-1), (R)))
+
+#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R)                          \
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),    \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), (R)))
+
+#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R)                         \
+  ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X),   \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), (R)))
+
+#define _mm512_fixupimm_round_ps(X, Y, Z, C, R)                                        \
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),    \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),             \
+    (__mmask16)(-1), (R)))
+
+#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R)                          \
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),     \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), (R)))
+
+#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R)                         \
+  ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X),    \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), (R)))
+
+#define _mm_fixupimm_round_sd(X, Y, Z, C, R)                                   \
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),    \
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),            \
+      (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R)                           \
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),    \
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),            \
+      (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R)                          \
+    ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X),   \
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),            \
+      (__mmask8)(U), (R)))
+
+#define _mm_fixupimm_round_ss(X, Y, Z, C, R)                                   \
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),      \
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),             \
+      (__mmask8)(-1), (R)))
+
+#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R)                           \
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),      \
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),             \
+      (__mmask8)(U), (R)))
+
+#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R)                          \
+    ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X),     \
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),             \
+      (__mmask8)(U), (R)))
+#endif
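Editorial sketch for the shuffle intrinsics above (assumed inputs a and b, <immintrin.h>, -mavx512f): within each 128-bit lane the immediate selects two elements from the first operand and two from the second.

#include <immintrin.h>

static inline __m512
shuffle_example (__m512 a, __m512 b)
{
  /* Per 128-bit lane: result = { a[0], a[1], b[2], b[3] }.  */
  return _mm512_shuffle_ps (a, b, _MM_SHUFFLE (3, 2, 1, 0));
}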
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movehdup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_moveldup_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A | (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A | (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+                                               (__v16si) __B,
+                                               (__v16si) __W,
+                                               (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A,
+                                               (__v16si) __B,
+                                               (__v16si)
+                                               _mm512_setzero_si512 (),
+                                               (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_or_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A | (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+                                               (__v8di) __B,
+                                               (__v8di) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A,
+                                               (__v8di) __B,
+                                               (__v8di)
+                                               _mm512_setzero_si512 (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A ^ (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A ^ (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_xor_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A ^ (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
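Editorial sketch of the merge-masked bitwise forms above: lanes whose mask bit is set receive a XOR b, while the remaining lanes keep the value of the first (write-through) operand.

#include <immintrin.h>

static inline __m512i
masked_xor_example (__m512i old, __mmask16 m, __m512i a, __m512i b)
{
  /* Only the 32-bit lanes selected by m are updated; others keep old.  */
  return _mm512_mask_xor_epi32 (old, m, a, b);
}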
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi32 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi32 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+                                                (__v16si)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rol_epi64 (__m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B)
+{
+  return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ror_epi64 (__m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+                                                (__v8di)
+                                                _mm512_undefined_epi32 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+                                                (__v8di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B)
+{
+  return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B,
+                                                (__v8di)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) __U);
+}
+
+#else
+#define _mm512_rol_epi32(A, B)                                           \
+    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),       \
+                                           (int)(B),                     \
+                                           (__v16si)_mm512_undefined_epi32 (), \
+                                           (__mmask16)(-1)))
+#define _mm512_mask_rol_epi32(W, U, A, B)                                \
+    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),       \
+                                           (int)(B),                     \
+                                           (__v16si)(__m512i)(W),        \
+                                           (__mmask16)(U)))
+#define _mm512_maskz_rol_epi32(U, A, B)                                          \
+    ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A),       \
+                                           (int)(B),                     \
+                                           (__v16si)_mm512_setzero_si512 (), \
+                                           (__mmask16)(U)))
+#define _mm512_ror_epi32(A, B)                                           \
+    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),       \
+                                           (int)(B),                     \
+                                           (__v16si)_mm512_undefined_epi32 (), \
+                                           (__mmask16)(-1)))
+#define _mm512_mask_ror_epi32(W, U, A, B)                                \
+    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),       \
+                                           (int)(B),                     \
+                                           (__v16si)(__m512i)(W),        \
+                                           (__mmask16)(U)))
+#define _mm512_maskz_ror_epi32(U, A, B)                                          \
+    ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A),       \
+                                           (int)(B),                     \
+                                           (__v16si)_mm512_setzero_si512 (), \
+                                           (__mmask16)(U)))
+#define _mm512_rol_epi64(A, B)                                           \
+    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),        \
+                                           (int)(B),                     \
+                                           (__v8di)_mm512_undefined_epi32 (),  \
+                                           (__mmask8)(-1)))
+#define _mm512_mask_rol_epi64(W, U, A, B)                                \
+    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),        \
+                                           (int)(B),                     \
+                                           (__v8di)(__m512i)(W),         \
+                                           (__mmask8)(U)))
+#define _mm512_maskz_rol_epi64(U, A, B)                                          \
+    ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A),        \
+                                           (int)(B),                     \
+                                           (__v8di)_mm512_setzero_si512 (),  \
+                                           (__mmask8)(U)))
+
+#define _mm512_ror_epi64(A, B)                                           \
+    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),        \
+                                           (int)(B),                     \
+                                           (__v8di)_mm512_undefined_epi32 (),  \
+                                           (__mmask8)(-1)))
+#define _mm512_mask_ror_epi64(W, U, A, B)                                \
+    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),        \
+                                           (int)(B),                     \
+                                           (__v8di)(__m512i)(W),         \
+                                           (__mmask8)(U)))
+#define _mm512_maskz_ror_epi64(U, A, B)                                          \
+    ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A),        \
+                                           (int)(B),                     \
+                                           (__v8di)_mm512_setzero_si512 (),  \
+                                           (__mmask8)(U)))
+#endif
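Editorial sketch of the immediate-count rotates above; the count is an immediate operand, so it must be a compile-time constant.

#include <immintrin.h>

static inline __m512i
rol8_example (__m512i v)
{
  /* Rotate every 32-bit element of v left by 8 bits.  */
  return _mm512_rol_epi32 (v, 8);
}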
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A & (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v16su) __A & (__v16su) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__v16si)
+                                                _mm512_setzero_si512 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_and_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) ((__v8du) __A & (__v8du) __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di) __W, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A,
+                                                (__v8di) __B,
+                                                (__v8di)
+                                                _mm512_setzero_pd (),
+                                                __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_si512 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_andnot_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W, __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_pd (),
+                                                 __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+                                               (__v16si) __B,
+                                               (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A,
+                                               (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_test_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A,
+                                              (__v8di) __B,
+                                              (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+                                                (__v16si) __B,
+                                                (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A,
+                                                (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+                                               (__v8di) __B,
+                                               (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A,
+                                               (__v8di) __B, __U);
+}
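Editorial sketch of the test-mask intrinsics above: testing a vector against itself yields a mask of its nonzero lanes.

#include <immintrin.h>

static inline __mmask16
nonzero_lanes (__m512i v)
{
  /* Bit i is set when (v[i] & v[i]) != 0, i.e. when lane i is nonzero.  */
  return _mm512_test_epi32_mask (v, v);
}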
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_ps (__m512 __A)
+{
+  return (__m512) _mm512_and_epi32 ((__m512i) __A,
+                                   _mm512_set1_epi32 (0x7fffffff));
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) _mm512_mask_and_epi32 ((__m512i) __W, __U, (__m512i) __A,
+                                        _mm512_set1_epi32 (0x7fffffff));
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_pd (__m512d __A)
+{
+  return (__m512d) _mm512_and_epi64 ((__m512i) __A,
+                                    _mm512_set1_epi64 (0x7fffffffffffffffLL));
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d)
+        _mm512_mask_and_epi64 ((__m512i) __W, __U, (__m512i) __A,
+                               _mm512_set1_epi64 (0x7fffffffffffffffLL));
+}
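Editorial note with a sketch: the _mm512_abs_ps/_mm512_abs_pd wrappers above take the absolute value by ANDing away the IEEE sign bit, so the call below is equivalent to masking each element of x with 0x7fffffffffffffff.

#include <immintrin.h>

static inline __m512d
fabs_example (__m512d x)
{
  return _mm512_abs_pd (x);   /* x with every sign bit cleared */
}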
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__v16si)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                                                     (__v8di) __B,
+                                                     (__v8di)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                                                     (__v8di) __B,
+                                                     (__v8di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A,
+                                                     (__v8di) __B,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__v16si)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                           __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+                                                     (__v8di) __B,
+                                                     (__v8di)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+                                                     (__v8di) __B,
+                                                     (__v8di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A,
+                                                     (__v8di) __B,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
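Editorial sketch of the unpack intrinsics above: _mm512_unpacklo_epi32 interleaves the low 32-bit elements of each 128-bit lane, producing { a0, b0, a1, b1 } per lane.

#include <immintrin.h>

static inline __m512i
interleave_low_dwords (__m512i a, __m512i b)
{
  return _mm512_unpacklo_epi32 (a, b);
}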
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u64 (__m128 __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u64 (__m128 __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si64 (__m128 __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u64(A, B)   \
+    ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B))
+
+#define _mm_cvt_roundss_si64(A, B)   \
+    ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvt_roundss_i64(A, B)   \
+    ((long long)__builtin_ia32_vcvtss2si64(A, B))
+
+#define _mm_cvtt_roundss_u64(A, B)  \
+    ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B))
+
+#define _mm_cvtt_roundss_i64(A, B)  \
+    ((long long)__builtin_ia32_vcvttss2si64(A, B))
+
+#define _mm_cvtt_roundss_si64(A, B)  \
+    ((long long)__builtin_ia32_vcvttss2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_u32 (__m128 __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_si32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_i32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_u32 (__m128 __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_i32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundss_si32 (__m128 __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R);
+}
+#else
+#define _mm_cvt_roundss_u32(A, B)   \
+    ((unsigned)__builtin_ia32_vcvtss2usi32(A, B))
+
+#define _mm_cvt_roundss_si32(A, B)   \
+    ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvt_roundss_i32(A, B)   \
+    ((int)__builtin_ia32_vcvtss2si32(A, B))
+
+#define _mm_cvtt_roundss_u32(A, B)  \
+    ((unsigned)__builtin_ia32_vcvttss2usi32(A, B))
+
+#define _mm_cvtt_roundss_si32(A, B)  \
+    ((int)__builtin_ia32_vcvttss2si32(A, B))
+
+#define _mm_cvtt_roundss_i32(A, B)  \
+    ((int)__builtin_ia32_vcvttss2si32(A, B))
+#endif
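
[Editorial note, not part of the patch: the scalar float-to-integer conversions above take an explicit rounding or SAE operand, which must be a compile-time constant. A minimal usage sketch, assuming an AVX-512F CPU and a `gcc -O2 -mavx512f` build:]

    #include <immintrin.h>
    #include <stdio.h>
    /* build: gcc -O2 -mavx512f cvt.c */
    int main(void)
    {
      __m128 x = _mm_set_ss(2.7f);
      /* explicit round-to-nearest-even vs. truncation toward zero, exceptions suppressed */
      int nearest = _mm_cvt_roundss_i32(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      int trunc   = _mm_cvtt_roundss_i32(x, _MM_FROUND_NO_EXC);
      printf("%d %d\n", nearest, trunc);   /* prints: 3 2 */
      return 0;
    }
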
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u64 (__m128d __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u64 (__m128d __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i64 (__m128d __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u64(A, B)   \
+    ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B))
+
+#define _mm_cvt_roundsd_si64(A, B)   \
+    ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvt_roundsd_i64(A, B)   \
+    ((long long)__builtin_ia32_vcvtsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_u64(A, B)   \
+    ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B))
+
+#define _mm_cvtt_roundsd_si64(A, B)   \
+    ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+
+#define _mm_cvtt_roundsd_i64(A, B)   \
+    ((long long)__builtin_ia32_vcvttsd2si64(A, B))
+#endif
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_u32 (__m128d __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_si32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_i32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_u32 (__m128d __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_i32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsd_si32 (__m128d __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R);
+}
+#else
+#define _mm_cvt_roundsd_u32(A, B)   \
+    ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B))
+
+#define _mm_cvt_roundsd_si32(A, B)   \
+    ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvt_roundsd_i32(A, B)   \
+    ((int)__builtin_ia32_vcvtsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_u32(A, B)   \
+    ((unsigned)__builtin_ia32_vcvttsd2usi32(A, B))
+
+#define _mm_cvtt_roundsd_si32(A, B)   \
+    ((int)__builtin_ia32_vcvttsd2si32(A, B))
+
+#define _mm_cvtt_roundsd_i32(A, B)   \
+    ((int)__builtin_ia32_vcvttsd2si32(A, B))
+#endif
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_movedup_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+                                                  (__v8df)
+                                                  _mm512_undefined_pd (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+                                                  (__v8df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __U);
+}
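
[Editorial note, not part of the patch: `_mm512_movedup_pd` duplicates each even-indexed double into the following odd lane. A small sketch of the lane layout, under the same AVX-512F build assumptions:]

    #include <immintrin.h>
    #include <stdio.h>
    /* build: gcc -O2 -mavx512f movedup.c */
    int main(void)
    {
      __m512d a = _mm512_setr_pd(0, 1, 2, 3, 4, 5, 6, 7);
      __m512d d = _mm512_movedup_pd(a);    /* duplicate the even-indexed lanes */
      double out[8];
      _mm512_storeu_pd(out, d);
      for (int i = 0; i < 8; i++)
        printf("%g ", out[i]);             /* 0 0 2 2 4 4 6 6 */
      printf("\n");
      return 0;
    }
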
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpackhi_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
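
[Editorial note, not part of the patch: the 512-bit unpack intrinsics interleave independently within each 128-bit lane, not across the whole register. A short sketch illustrating that layout:]

    #include <immintrin.h>
    #include <stdio.h>
    /* build: gcc -O2 -mavx512f unpack.c */
    int main(void)
    {
      __m512d a = _mm512_setr_pd(0, 1, 2, 3, 4, 5, 6, 7);
      __m512d b = _mm512_setr_pd(10, 11, 12, 13, 14, 15, 16, 17);
      double lo[8], hi[8];
      /* interleaving happens per 128-bit lane: lo pairs the even lanes, hi the odd ones */
      _mm512_storeu_pd(lo, _mm512_unpacklo_pd(a, b));  /* 0 10 2 12 4 14 6 16 */
      _mm512_storeu_pd(hi, _mm512_unpackhi_pd(a, b));  /* 1 11 3 13 5 15 7 17 */
      printf("%g %g %g %g\n", lo[0], lo[1], hi[0], hi[1]);
      return 0;
    }
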
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_pd (__m256 __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A,
+                           const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_ps (__m256i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A,
+                           const int __R)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U, __R);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundps_ph (__m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+                                                    __I,
+                                                    (__v16hi)
+                                                    _mm256_undefined_si256 (),
+                                                    -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_ph (__m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+                                                    __I,
+                                                    (__v16hi)
+                                                    _mm256_undefined_si256 (),
+                                                    -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A,
+                           const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+                                                    __I,
+                                                    (__v16hi) __U,
+                                                    (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+                                                    __I,
+                                                    (__v16hi) __U,
+                                                    (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+                                                    __I,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) __W);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I)
+{
+  return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A,
+                                                    __I,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) __W);
+}
+#else
+#define _mm512_cvt_roundps_pd(A, B)             \
+    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B)
+
+#define _mm512_mask_cvt_roundps_pd(W, U, A, B)   \
+    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B)
+
+#define _mm512_maskz_cvt_roundps_pd(U, A, B)     \
+    (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B)
+
+#define _mm512_cvt_roundph_ps(A, B)             \
+    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundph_ps(W, U, A, B)   \
+    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundph_ps(U, A, B)     \
+    (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B)
+
+#define _mm512_cvt_roundps_ph(A, I)                                             \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+    (__v16hi)_mm256_undefined_si256 (), -1))
+#define _mm512_cvtps_ph(A, I)                                           \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+    (__v16hi)_mm256_undefined_si256 (), -1))
+#define _mm512_mask_cvt_roundps_ph(U, W, A, I)                          \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+    (__v16hi)(__m256i)(U), (__mmask16) (W)))
+#define _mm512_mask_cvtps_ph(U, W, A, I)                                \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+    (__v16hi)(__m256i)(U), (__mmask16) (W)))
+#define _mm512_maskz_cvt_roundps_ph(W, A, I)                                    \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+    (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
+#define _mm512_maskz_cvtps_ph(W, A, I)                                  \
+  ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\
+    (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W)))
+#endif
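
[Editorial note, not part of the patch: the ps/ph conversions above pack 16 single-precision floats into 16 half-precision values and back. A round-trip sketch, assuming an AVX-512F build; the chosen rounding constants follow the usual Intel intrinsic-guide conventions:]

    #include <immintrin.h>
    #include <stdio.h>
    /* build: gcc -O2 -mavx512f ph.c */
    int main(void)
    {
      __m512 x = _mm512_set1_ps(0.1f);
      /* pack to fp16 with round-to-nearest-even, exceptions suppressed */
      __m256i h = _mm512_cvt_roundps_ph(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      /* widen back; the round trip keeps only fp16 precision */
      __m512 y = _mm512_cvt_roundph_ps(h, _MM_FROUND_CUR_DIRECTION);
      float out[16];
      _mm512_storeu_ps(out, y);
      printf("%.6f\n", out[0]);   /* ~0.099976 */
      return 0;
    }
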
+
+#ifdef __OPTIMIZE__
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_ps (__m512d __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                                                  (__v8sf)
+                                                  _mm256_undefined_ps (),
+                                                  (__mmask8) -1, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A,
+                           const int __R)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A,
+                                                (__v2df) __B,
+                                                __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                        __m128d __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A,
+                                                     (__v2df) __B,
+                                                     (__v4sf) __W,
+                                                     __U,
+                                                     __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A,
+                        __m128d __B, const int __R)
+{
+  return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A,
+                                                     (__v2df) __B,
+                                                     _mm_setzero_ps (),
+                                                     __U,
+                                                     __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A,
+                                                 (__v4sf) __B,
+                                                 __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                        __m128 __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A,
+                                                      (__v4sf) __B,
+                                                      (__v2df) __W,
+                                                      __U,
+                                                      __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A,
+                                                      (__v4sf) __B,
+                                                      _mm_setzero_pd (),
+                                                      __U,
+                                                      __R);
+}
+#else
+#define _mm512_cvt_roundpd_ps(A, B)             \
+    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B)
+
+#define _mm512_mask_cvt_roundpd_ps(W, U, A, B)   \
+    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B)
+
+#define _mm512_maskz_cvt_roundpd_ps(U, A, B)     \
+    (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B)
+
+#define _mm_cvt_roundsd_ss(A, B, C)             \
+    (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C)
+
+#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C))
+
+#define _mm_maskz_cvt_roundsd_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), \
+                                               (U), (C))
+
+#define _mm_cvt_roundss_sd(A, B, C)             \
+    (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C)
+
+#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C))
+
+#define _mm_maskz_cvt_roundss_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), \
+                                                (U), (C))
+
+#endif
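
[Editorial note, not part of the patch: the pd-to-ps narrowing above accepts an embedded rounding mode instead of relying on MXCSR. A one-function sketch under the same build assumptions:]

    #include <immintrin.h>
    /* build: gcc -O2 -mavx512f narrow.c */
    __m256 narrow_rtz(__m512d x)
    {
      /* narrow 8 doubles to 8 floats, rounding toward zero, exceptions suppressed */
      return _mm512_cvt_roundpd_ps(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
    }
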
+
+#define _mm_mask_cvtss_sd(W, U, A, B) \
+    _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_cvtss_sd(U, A, B) \
+    _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_mask_cvtsd_ss(W, U, A, B) \
+    _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_cvtsd_ss(U, A, B) \
+    _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION)
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_si512 (__m512i * __P, __m512i __A)
+{
+  __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_ps (float *__P, __m512 __A)
+{
+  __builtin_ia32_movntps512 (__P, (__v16sf) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_pd (double *__P, __m512d __A)
+{
+  __builtin_ia32_movntpd512 (__P, (__v8df) __A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_stream_load_si512 (void *__P)
+{
+  return __builtin_ia32_movntdqa512 ((__v8di *)__P);
+}
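
[Editorial note, not part of the patch: the stream intrinsics above issue non-temporal stores, so the destination must be 64-byte aligned and a fence is needed before other threads may rely on the data. A hedged sketch; the function name fill_nt is illustrative only:]

    #include <immintrin.h>
    #include <stddef.h>
    /* build: gcc -O2 -mavx512f stream.c */
    void fill_nt(float *dst, float v, size_t n)   /* dst 64-byte aligned, n a multiple of 16 */
    {
      __m512 x = _mm512_set1_ps(v);
      for (size_t i = 0; i < n; i += 16)
        _mm512_stream_ps(dst + i, x);   /* non-temporal store, bypasses the cache hierarchy */
      _mm_sfence();                     /* order the NT stores before subsequent stores */
    }
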
+
+/* Constants for mantissa extraction */
+typedef enum
+{
+  _MM_MANT_NORM_1_2,           /* interval [1, 2)      */
+  _MM_MANT_NORM_p5_2,          /* interval [0.5, 2)    */
+  _MM_MANT_NORM_p5_1,          /* interval [0.5, 1)    */
+  _MM_MANT_NORM_p75_1p5                /* interval [0.75, 1.5) */
+} _MM_MANTISSA_NORM_ENUM;
+
+typedef enum
+{
+  _MM_MANT_SIGN_src,           /* sign = sign(SRC)     */
+  _MM_MANT_SIGN_zero,          /* sign = 0             */
+  _MM_MANT_SIGN_nan            /* DEST = NaN if sign(SRC) = 1 */
+} _MM_MANTISSA_SIGN_ENUM;
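
[Editorial note, not part of the patch: the getmant intrinsics below combine these two enums into the 4-bit immediate as (sign << 2) | norm, so _MM_MANT_NORM_1_2 with _MM_MANT_SIGN_src extracts a mantissa in [1, 2) carrying the source sign. A small sketch under the usual AVX-512F build assumptions:]

    #include <immintrin.h>
    #include <stdio.h>
    /* build: gcc -O2 -mavx512f getmant.c */
    int main(void)
    {
      __m512d x = _mm512_set1_pd(-24.0);   /* -24 = -1.5 * 2^4 */
      __m512d m = _mm512_getmant_round_pd(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src,
                                          _MM_FROUND_CUR_DIRECTION);
      double out[8];
      _mm512_storeu_pd(out, m);
      printf("%g\n", out[0]);              /* -1.5: mantissa in [1,2), sign taken from source */
      return 0;
    }
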
+
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R)
+{
+  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_ps (__m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                            const int __R)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_pd (__m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                            const int __R)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+                                                    (__C << 2) | __B,
+                                                    _mm512_undefined_pd (),
+                                                    (__mmask8) -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                             _MM_MANTISSA_NORM_ENUM __B,
+                             _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v8df) __W, __U,
+                                                    __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A,
+                              _MM_MANTISSA_NORM_ENUM __B,
+                              _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    __U, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+                                                   (__C << 2) | __B,
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                             _MM_MANTISSA_NORM_ENUM __B,
+                             _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v16sf) __W, __U,
+                                                   __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A,
+                              _MM_MANTISSA_NORM_ENUM __B,
+                              _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_sd (__m128d __A, __m128d __B,
+                     _MM_MANTISSA_NORM_ENUM __C,
+                     _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
+                                                 (__v2df) __B,
+                                                 (__D << 2) | __C,
+                                                  __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                             __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+                             _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__D << 2) | __C,
+                                                    (__v2df) __W,
+                                                    __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                              _MM_MANTISSA_NORM_ENUM __C,
+                              _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__D << 2) | __C,
+                                                        (__v2df)
+                                                        _mm_setzero_pd(),
+                                                       __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_ss (__m128 __A, __m128 __B,
+                     _MM_MANTISSA_NORM_ENUM __C,
+                     _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__D << 2) | __C,
+                                                 __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                             __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+                             _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__D << 2) | __C,
+                                                    (__v4sf) __W,
+                                                    __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                              _MM_MANTISSA_NORM_ENUM __C,
+                              _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__D << 2) | __C,
+                                                        (__v4sf)
+                                                        _mm_setzero_ps(),
+                                                       __U, __R);
+}
+
+#else
+#define _mm512_getmant_round_pd(X, B, C, R)                                                  \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)_mm512_undefined_pd(), \
+                                              (__mmask8)-1,\
+                                             (R)))
+
+#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R)                                       \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)(W),                 \
+                                              (__mmask8)(U),\
+                                             (R)))
+
+#define _mm512_maskz_getmant_round_pd(U, X, B, C, R)                                         \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)_mm512_setzero_pd(), \
+                                              (__mmask8)(U),\
+                                             (R)))
+#define _mm512_getmant_round_ps(X, B, C, R)                                                  \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)_mm512_undefined_ps(), \
+                                             (__mmask16)-1,\
+                                            (R)))
+
+#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R)                                       \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)(W),                  \
+                                             (__mmask16)(U),\
+                                            (R)))
+
+#define _mm512_maskz_getmant_round_ps(U, X, B, C, R)                                         \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)_mm512_setzero_ps(),  \
+                                             (__mmask16)(U),\
+                                            (R)))
+#define _mm_getmant_round_sd(X, Y, C, D, R)                                                  \
+  ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X),                    \
+                                           (__v2df)(__m128d)(Y),       \
+                                           (int)(((D)<<2) | (C)),      \
+                                           (R)))
+
+#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R)                                       \
+  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                  \
+                                            (__v2df)(__m128d)(Y),                  \
+                                             (int)(((D)<<2) | (C)),                 \
+                                             (__v2df)(__m128d)(W),                   \
+                                             (__mmask8)(U),\
+                                            (R)))
+
+#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R)                                         \
+  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                  \
+                                                 (__v2df)(__m128d)(Y),                  \
+                                             (int)(((D)<<2) | (C)),              \
+                                             (__v2df)(__m128d)_mm_setzero_pd(),  \
+                                             (__mmask8)(U),\
+                                            (R)))
+
+#define _mm_getmant_round_ss(X, Y, C, D, R)                                                  \
+  ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X),                      \
+                                          (__v4sf)(__m128)(Y),         \
+                                          (int)(((D)<<2) | (C)),       \
+                                          (R)))
+
+#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R)                                       \
+  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                  \
+                                            (__v4sf)(__m128)(Y),                  \
+                                             (int)(((D)<<2) | (C)),                 \
+                                             (__v4sf)(__m128)(W),                   \
+                                             (__mmask8)(U),\
+                                            (R)))
+
+#define _mm_maskz_getmant_round_ss(U, X, Y, C, D, R)                                         \
+  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                  \
+                                                 (__v4sf)(__m128)(Y),                  \
+                                             (int)(((D)<<2) | (C)),              \
+                                             (__v4sf)(__m128)_mm_setzero_ps(),  \
+                                             (__mmask8)(U),\
+                                            (R)))
+
+#define _mm_getexp_round_ss(A, B, R)                                                 \
+  ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R))
+
+#define _mm_mask_getexp_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_getexp_round_sd(A, B, R)                                                  \
+  ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R))
+
+#define _mm_mask_getexp_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_getexp_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+
+#define _mm512_getexp_round_ps(A, R)                                           \
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),               \
+  (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R))
+
+#define _mm512_mask_getexp_round_ps(W, U, A, R)                                        \
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),               \
+  (__v16sf)(__m512)(W), (__mmask16)(U), R))
+
+#define _mm512_maskz_getexp_round_ps(U, A, R)                                  \
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),               \
+  (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R))
+
+#define _mm512_getexp_round_pd(A, R)                                           \
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),              \
+  (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R))
+
+#define _mm512_mask_getexp_round_pd(W, U, A, R)                                        \
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),              \
+  (__v8df)(__m512d)(W), (__mmask8)(U), R))
+
+#define _mm512_maskz_getexp_round_pd(U, A, R)                                  \
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),              \
+  (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R))
+#endif
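
[Editorial note, not part of the patch: getexp returns the unbiased binary exponent of each element as a floating-point value, i.e. floor(log2(|x|)). A two-line sketch:]

    /* build: gcc -O2 -mavx512f, #include <immintrin.h> */
    __m512 x = _mm512_set1_ps(20.0f);                              /* 20 = 1.25 * 2^4 */
    __m512 e = _mm512_getexp_round_ps(x, _MM_FROUND_CUR_DIRECTION); /* every lane becomes 4.0f */
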
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
+                                                 (__v16sf)
+                                                 _mm512_undefined_ps (),
+                                                 -1, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C,
+                                const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
+                                                 (__v16sf) __A,
+                                                 (__mmask16) __B, __R);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B,
+                                 const int __imm, const int __R)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
+                                                 __imm,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __A, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
+                                                  (__v8df)
+                                                  _mm512_undefined_pd (),
+                                                  -1, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B,
+                                __m512d __C, const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
+                                                  (__v8df) __A,
+                                                  (__mmask8) __B, __R);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B,
+                                 const int __imm, const int __R)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
+                                                  __imm,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __A, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm,
+                        const int __R)
+{
+  return (__m128)
+    __builtin_ia32_rndscaless_mask_round ((__v4sf) __A,
+                                         (__v4sf) __B, __imm,
+                                         (__v4sf)
+                                         _mm_setzero_ps (),
+                                         (__mmask8) -1,
+                                         __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_ss (__m128 __A, __mmask8 __B, __m128 __C,
+                             __m128 __D, const int __imm, const int __R)
+{
+  return (__m128)
+    __builtin_ia32_rndscaless_mask_round ((__v4sf) __C,
+                                         (__v4sf) __D, __imm,
+                                         (__v4sf) __A,
+                                         (__mmask8) __B,
+                                         __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_ss (__mmask8 __A, __m128 __B, __m128 __C,
+                              const int __imm, const int __R)
+{
+  return (__m128)
+    __builtin_ia32_rndscaless_mask_round ((__v4sf) __B,
+                                         (__v4sf) __C, __imm,
+                                         (__v4sf)
+                                         _mm_setzero_ps (),
+                                         (__mmask8) __A,
+                                         __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm,
+                        const int __R)
+{
+  return (__m128d)
+    __builtin_ia32_rndscalesd_mask_round ((__v2df) __A,
+                                         (__v2df) __B, __imm,
+                                         (__v2df)
+                                         _mm_setzero_pd (),
+                                         (__mmask8) -1,
+                                         __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_sd (__m128d __A, __mmask8 __B, __m128d __C,
+                             __m128d __D, const int __imm, const int __R)
+{
+  return (__m128d)
+    __builtin_ia32_rndscalesd_mask_round ((__v2df) __C,
+                                         (__v2df) __D, __imm,
+                                         (__v2df) __A,
+                                         (__mmask8) __B,
+                                         __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_sd (__mmask8 __A, __m128d __B, __m128d __C,
+                              const int __imm, const int __R)
+{
+  return (__m128d)
+    __builtin_ia32_rndscalesd_mask_round ((__v2df) __B,
+                                         (__v2df) __C, __imm,
+                                         (__v2df)
+                                         _mm_setzero_pd (),
+                                         (__mmask8) __A,
+                                         __R);
+}
+
+#else
+#define _mm512_roundscale_round_ps(A, B, R) \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
+    (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), R))
+#define _mm512_mask_roundscale_round_ps(A, B, C, D, R)                         \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C),      \
+                                           (int)(D),                   \
+                                           (__v16sf)(__m512)(A),       \
+                                           (__mmask16)(B), R))
+#define _mm512_maskz_roundscale_round_ps(A, B, C, R)                           \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B),      \
+                                           (int)(C),                   \
+                                           (__v16sf)_mm512_setzero_ps(),\
+                                           (__mmask16)(A), R))
+#define _mm512_roundscale_round_pd(A, B, R) \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
+    (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), R))
+#define _mm512_mask_roundscale_round_pd(A, B, C, D, R)                         \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C),     \
+                                            (int)(D),                  \
+                                            (__v8df)(__m512d)(A),      \
+                                            (__mmask8)(B), R))
+#define _mm512_maskz_roundscale_round_pd(A, B, C, R)                           \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B),     \
+                                            (int)(C),                  \
+                                            (__v8df)_mm512_setzero_pd(),\
+                                            (__mmask8)(A), R))
+#define _mm_roundscale_round_ss(A, B, I, R)                            \
+  ((__m128)                                                            \
+   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A),                \
+                                        (__v4sf) (__m128) (B),         \
+                                        (int) (I),                     \
+                                        (__v4sf) _mm_setzero_ps (),    \
+                                        (__mmask8) (-1),               \
+                                        (int) (R)))
+#define _mm_mask_roundscale_round_ss(A, U, B, C, I, R)         \
+  ((__m128)                                                    \
+   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B),        \
+                                        (__v4sf) (__m128) (C), \
+                                        (int) (I),             \
+                                        (__v4sf) (__m128) (A), \
+                                        (__mmask8) (U),        \
+                                        (int) (R)))
+#define _mm_maskz_roundscale_round_ss(U, A, B, I, R)                   \
+  ((__m128)                                                            \
+   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A),                \
+                                        (__v4sf) (__m128) (B),         \
+                                        (int) (I),                     \
+                                        (__v4sf) _mm_setzero_ps (),    \
+                                        (__mmask8) (U),                \
+                                        (int) (R)))
+#define _mm_roundscale_round_sd(A, B, I, R)                            \
+  ((__m128d)                                                           \
+   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A),       \
+                                        (__v2df) (__m128d) (B),        \
+                                        (int) (I),                     \
+                                        (__v2df) _mm_setzero_pd (),    \
+                                        (__mmask8) (-1),               \
+                                        (int) (R)))
+#define _mm_mask_roundscale_round_sd(A, U, B, C, I, R)                 \
+  ((__m128d)                                                           \
+   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B),       \
+                                        (__v2df) (__m128d) (C),        \
+                                        (int) (I),                     \
+                                        (__v2df) (__m128d) (A),        \
+                                        (__mmask8) (U),                \
+                                        (int) (R)))
+#define _mm_maskz_roundscale_round_sd(U, A, B, I, R)                   \
+  ((__m128d)                                                           \
+   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A),       \
+                                        (__v2df) (__m128d) (B),        \
+                                        (int) (I),                     \
+                                        (__v2df) _mm_setzero_pd (),    \
+                                        (__mmask8) (U),                \
+                                        (int) (R)))
+#endif
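
[Editorial note, not part of the patch: roundscale rounds each element to a multiple of 2^-M, where M sits in the upper nibble of the immediate and the lower nibble selects the rounding control; the separate __R operand is only the SAE setting. A hedged sketch of that encoding:]

    /* build: gcc -O2 -mavx512f, #include <immintrin.h> */
    __m512 x = _mm512_set1_ps(1.37f);
    /* imm8 = (M << 4) | rounding: M = 2 means multiples of 2^-2 = 0.25, round toward -inf */
    __m512 r = _mm512_roundscale_round_ps(x, (2 << 4) | _MM_FROUND_TO_NEG_INF,
                                          _MM_FROUND_CUR_DIRECTION);
    /* each lane becomes 1.25, the largest multiple of 0.25 not above 1.37 */
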
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                 _MM_FROUND_FLOOR,
+                                                 (__v16sf) __A, -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_floor_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                  _MM_FROUND_FLOOR,
+                                                  (__v8df) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                 _MM_FROUND_CEIL,
+                                                 (__v16sf) __A, -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_ceil_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                  _MM_FROUND_CEIL,
+                                                  (__v8df) __A, -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                 _MM_FROUND_FLOOR,
+                                                 (__v16sf) __W, __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                  _MM_FROUND_FLOOR,
+                                                  (__v8df) __W, __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A,
+                                                 _MM_FROUND_CEIL,
+                                                 (__v16sf) __W, __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A,
+                                                  _MM_FROUND_CEIL,
+                                                  (__v8df) __W, __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
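The floor/ceil wrappers above are thin front-ends over the rndscale builtins. A minimal usage sketch (illustrative only, not part of the commit; assumes an AVX-512F target and compilation with -mavx512f):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        float in[16], out[16];
        for (int i = 0; i < 16; i++)
            in[i] = i + 0.75f;

        __m512 v = _mm512_loadu_ps(in);
        __m512 f = _mm512_floor_ps(v);   /* round each lane toward -infinity */
        _mm512_storeu_ps(out, f);

        for (int i = 0; i < 16; i++)
            printf("%.2f -> %.2f\n", in[i], out[i]);
        return 0;
    }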
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+                                                 (__v16si) __B, __imm,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A,
+                         __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+                                                 (__v16si) __B, __imm,
+                                                 (__v16si) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B,
+                          const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A,
+                                                 (__v16si) __B, __imm,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+                                                 (__v8di) __B, __imm,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A,
+                         __m512i __B, const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+                                                 (__v8di) __B, __imm,
+                                                 (__v8di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B,
+                          const int __imm)
+{
+  return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A,
+                                                 (__v8di) __B, __imm,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) __U);
+}
+#else
+#define _mm512_alignr_epi32(X, Y, C)                                        \
+    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
+        (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_epi32 (),\
+        (__mmask16)-1))
+
+#define _mm512_mask_alignr_epi32(W, U, X, Y, C)                             \
+    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
+        (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W),             \
+        (__mmask16)(U)))
+
+#define _mm512_maskz_alignr_epi32(U, X, Y, C)                               \
+    ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X),         \
+        (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_setzero_si512 (),\
+        (__mmask16)(U)))
+
+#define _mm512_alignr_epi64(X, Y, C)                                        \
+    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
+        (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_epi32 (),  \
+       (__mmask8)-1))
+
+#define _mm512_mask_alignr_epi64(W, U, X, Y, C)                             \
+    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
+        (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U)))
+
+#define _mm512_maskz_alignr_epi64(U, X, Y, C)                               \
+    ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X),          \
+        (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_setzero_si512 (),\
+        (__mmask8)(U)))
+#endif
+
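The alignr intrinsics above require a compile-time immediate, which is why the header falls back to macros when __OPTIMIZE__ is not defined. A hedged sketch of how the dword variant is typically called (hypothetical helper, not from the commit; assumes -mavx512f):

    #include <immintrin.h>

    /* Lane i of the result is dword (i + 4) of the 32-lane
     * concatenation of hi and lo; the shift count must be a
     * compile-time constant. */
    __m512i rotate_window(__m512i hi, __m512i lo)
    {
        return _mm512_alignr_epi32(hi, lo, 4);
    }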
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A,
+                                                    (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+                                                   (__v8di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A,
+                                                   (__v8di) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+                                                    (__v16si) __B,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A,
+                                                    (__v16si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+                                                   (__v8di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A,
+                                                   (__v8di) __B,
+                                                   (__mmask8) -1);
+}
+
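The comparison intrinsics above return an opaque __mmask16/__mmask8 rather than a vector, so the mask is normally fed into a masked or blend operation. A small illustrative sketch (editorial, not part of the commit; clamp_to_limit is a hypothetical helper, assumes -mavx512f):

    #include <immintrin.h>

    __m512i clamp_to_limit(__m512i x, __m512i limit)
    {
        __mmask16 over = _mm512_cmpgt_epi32_mask(x, limit);
        /* Where x > limit, take limit; elsewhere keep x. */
        return _mm512_mask_blend_epi32(over, x, limit);
    }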
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 5,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 5,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 5,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 5,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 5,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 5,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpge_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 5,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 5,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 2,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 2,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 2,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 2,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 2,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 2,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 2,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 2,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 1,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 1,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 1,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 1,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 1,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 1,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 1,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 1,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 4,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 4,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 4,
+                                                   (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                   (__v16si) __Y, 4,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 4,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 4,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 4,
+                                                   (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                   (__v8di) __Y, 4,
+                                                   (__mmask8) -1);
+}
+
+#define _MM_CMPINT_EQ      0x0
+#define _MM_CMPINT_LT      0x1
+#define _MM_CMPINT_LE      0x2
+#define _MM_CMPINT_UNUSED   0x3
+#define _MM_CMPINT_NE      0x4
+#define _MM_CMPINT_NLT     0x5
+#define _MM_CMPINT_GE      0x5
+#define _MM_CMPINT_NLE     0x6
+#define _MM_CMPINT_GT      0x6
+
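These _MM_CMPINT_* constants are the predicate argument of the generic integer compare intrinsics defined below, so the named compares above are just fixed-predicate shorthands. A brief sketch of the equivalence (illustrative only, not part of the commit; lanes_below is a hypothetical helper, assumes -mavx512f):

    #include <immintrin.h>

    __mmask16 lanes_below(__m512i x, __m512i limit)
    {
        __mmask16 a = _mm512_cmp_epi32_mask(x, limit, _MM_CMPINT_LT);
        __mmask16 b = _mm512_cmplt_epi32_mask(x, limit);  /* same mask */
        (void) b;
        return a;
    }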
+#ifdef __OPTIMIZE__
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftli_mask16 (__mmask16 __A, unsigned int __B)
+{
+  return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A,
+                                               (__mmask8) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kshiftri_mask16 (__mmask16 __A, unsigned int __B)
+{
+  return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A,
+                                               (__mmask8) __B);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                (__v8di) __Y, __P,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                 (__v16si) __Y, __P,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                 (__v8di) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                  (__v16si) __Y, __P,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P,
+                         const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, __P,
+                                                 (__mmask8) -1, __R);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, __P,
+                                                  (__mmask16) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+                           const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X,
+                                                (__v8di) __Y, __P,
+                                                (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+                           const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X,
+                                                 (__v16si) __Y, __P,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y,
+                           const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X,
+                                                 (__v8di) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y,
+                           const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X,
+                                                  (__v16si) __Y, __P,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y,
+                              const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, __P,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y,
+                              const int __P, const int __R)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, __P,
+                                                  (__mmask16) __U, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+                                              (__v2df) __Y, __P,
+                                              (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y,
+                           const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+                                              (__v2df) __Y, __P,
+                                              (__mmask8) __M, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+                                              (__v4sf) __Y, __P,
+                                              (__mmask8) -1, __R);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y,
+                           const int __P, const int __R)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+                                              (__v4sf) __Y, __P,
+                                              (__mmask8) __M, __R);
+}
+
+#else
+#define _kshiftli_mask16(X, Y)                                         \
+  ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y)))
+
+#define _kshiftri_mask16(X, Y)                                         \
+  ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y)))
+
+#define _mm512_cmp_epi64_mask(X, Y, P)                                 \
+  ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X),       \
+                                          (__v8di)(__m512i)(Y), (int)(P),\
+                                          (__mmask8)-1))
+
+#define _mm512_cmp_epi32_mask(X, Y, P)                                 \
+  ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X),     \
+                                           (__v16si)(__m512i)(Y), (int)(P), \
+                                           (__mmask16)-1))
+
+#define _mm512_cmp_epu64_mask(X, Y, P)                                 \
+  ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X),      \
+                                           (__v8di)(__m512i)(Y), (int)(P),\
+                                           (__mmask8)-1))
+
+#define _mm512_cmp_epu32_mask(X, Y, P)                                 \
+  ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X),    \
+                                            (__v16si)(__m512i)(Y), (int)(P), \
+                                            (__mmask16)-1))
+
+#define _mm512_cmp_round_pd_mask(X, Y, P, R)                           \
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),      \
+                                           (__v8df)(__m512d)(Y), (int)(P),\
+                                           (__mmask8)-1, R))
+
+#define _mm512_cmp_round_ps_mask(X, Y, P, R)                           \
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),     \
+                                            (__v16sf)(__m512)(Y), (int)(P),\
+                                            (__mmask16)-1, R))
+
+#define _mm512_mask_cmp_epi64_mask(M, X, Y, P)                         \
+  ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X),       \
+                                          (__v8di)(__m512i)(Y), (int)(P),\
+                                          (__mmask8)(M)))
+
+#define _mm512_mask_cmp_epi32_mask(M, X, Y, P)                         \
+  ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X),     \
+                                           (__v16si)(__m512i)(Y), (int)(P), \
+                                           (__mmask16)(M)))
+
+#define _mm512_mask_cmp_epu64_mask(M, X, Y, P)                         \
+  ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X),      \
+                                           (__v8di)(__m512i)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm512_mask_cmp_epu32_mask(M, X, Y, P)                         \
+  ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X),    \
+                                            (__v16si)(__m512i)(Y), (int)(P), \
+                                            (__mmask16)(M)))
+
+#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R)                   \
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),      \
+                                           (__v8df)(__m512d)(Y), (int)(P),\
+                                           (__mmask8)(M), R))
+
+#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R)                   \
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),     \
+                                            (__v16sf)(__m512)(Y), (int)(P),\
+                                            (__mmask16)(M), R))
+
+#define _mm_cmp_round_sd_mask(X, Y, P, R)                              \
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),         \
+                                        (__v2df)(__m128d)(Y), (int)(P),\
+                                        (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R)                      \
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),         \
+                                        (__v2df)(__m128d)(Y), (int)(P),\
+                                        (M), R))
+
+#define _mm_cmp_round_ss_mask(X, Y, P, R)                              \
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),          \
+                                        (__v4sf)(__m128)(Y), (int)(P), \
+                                        (__mmask8)-1, R))
+
+#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R)                      \
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),          \
+                                        (__v4sf)(__m128)(Y), (int)(P), \
+                                        (M), R))
+#endif
+
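The *_cmp_round_* forms above additionally take an SAE/rounding argument. A hedged example of a floating-point compare with exceptions suppressed (illustrative only, not part of the commit; uses the _CMP_* predicate constants from avxintrin.h, assumes -mavx512f):

    #include <immintrin.h>

    __mmask8 lanes_less(__m512d x, __m512d y)
    {
        /* _MM_FROUND_NO_EXC suppresses exception signalling for the
         * compare; _MM_FROUND_CUR_DIRECTION would keep the default. */
        return _mm512_cmp_round_pd_mask(x, y, _CMP_LT_OQ, _MM_FROUND_NO_EXC);
    }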
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_ps (__m512i __index, void const *__addr, int __scale)
+{
+  __m512 __v1_old = _mm512_undefined_ps ();
+  __mmask16 __mask = 0xFFFF;
+
+  return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old,
+                                               __addr,
+                                               (__v16si) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_ps (__m512 __v1_old, __mmask16 __mask,
+                         __m512i __index, void const *__addr, int __scale)
+{
+  return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old,
+                                               __addr,
+                                               (__v16si) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_pd (__m256i __index, void const *__addr, int __scale)
+{
+  __m512d __v1_old = _mm512_undefined_pd ();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old,
+                                               __addr,
+                                               (__v8si) __index, __mask,
+                                               __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask,
+                         __m256i __index, void const *__addr, int __scale)
+{
+  return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old,
+                                               __addr,
+                                               (__v8si) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_ps (__m512i __index, void const *__addr, int __scale)
+{
+  __m256 __v1_old = _mm256_undefined_ps ();
+  __mmask8 __mask = 0xFF;
+
+  return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,
+                                               __addr,
+                                               (__v8di) __index, __mask,
+                                               __scale);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask,
+                         __m512i __index, void const *__addr, int __scale)
+{
+  return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old,
+                                               __addr,
+                                               (__v8di) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_pd (__m512i __index, void const *__addr, int __scale)
+{
+  __m512d __v1_old = _mm512_undefined_pd ();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old,
+                                               __addr,
+                                               (__v8di) __index, __mask,
+                                               __scale);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 __mask,
+                         __m512i __index, void const *__addr, int __scale)
+{
+  return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old,
+                                               __addr,
+                                               (__v8di) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_epi32 (__m512i __index, void const *__addr, int __scale)
+{
+  __m512i __v1_old = _mm512_undefined_epi32 ();
+  __mmask16 __mask = 0xFFFF;
+
+  return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old,
+                                                __addr,
+                                                (__v16si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask,
+                            __m512i __index, void const *__addr, int __scale)
+{
+  return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old,
+                                                __addr,
+                                                (__v16si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32gather_epi64 (__m256i __index, void const *__addr, int __scale)
+{
+  __m512i __v1_old = _mm512_undefined_epi32 ();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old,
+                                               __addr,
+                                               (__v8si) __index, __mask,
+                                               __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask,
+                            __m256i __index, void const *__addr,
+                            int __scale)
+{
+  return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old,
+                                               __addr,
+                                               (__v8si) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_epi32 (__m512i __index, void const *__addr, int __scale)
+{
+  __m256i __v1_old = _mm256_undefined_si256 ();
+  __mmask8 __mask = 0xFF;
+
+  return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old,
+                                                __addr,
+                                                (__v8di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask,
+                            __m512i __index, void const *__addr, int __scale)
+{
+  return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old,
+                                                __addr,
+                                                (__v8di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64gather_epi64 (__m512i __index, void const *__addr, int __scale)
+{
+  __m512i __v1_old = _mm512_undefined_epi32 ();
+  __mmask8 __mask = 0xFF;
+
+  return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old,
+                                               __addr,
+                                               (__v8di) __index, __mask,
+                                               __scale);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask,
+                            __m512i __index, void const *__addr,
+                            int __scale)
+{
+  return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old,
+                                               __addr,
+                                               (__v8di) __index,
+                                               __mask, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_ps (void *__addr, __m512i __index, __m512 __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF,
+                                (__v16si) __index, (__v16sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_ps (void *__addr, __mmask16 __mask,
+                          __m512i __index, __m512 __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index,
+                                (__v16sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_pd (void *__addr, __m256i __index, __m512d __v1,
+                     int __scale)
+{
+  __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF,
+                               (__v8si) __index, (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+                          __m256i __index, __m512d __v1, int __scale)
+{
+  __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index,
+                               (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_ps (void *__addr, __m512i __index, __m256 __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF,
+                                (__v8di) __index, (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+                          __m512i __index, __m256 __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index,
+                                (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_pd (void *__addr, __m512i __index, __m512d __v1,
+                     int __scale)
+{
+  __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF,
+                               (__v8di) __index, (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+                          __m512i __index, __m512d __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index,
+                               (__v8df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_epi32 (void *__addr, __m512i __index,
+                        __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF,
+                                (__v16si) __index, (__v16si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_epi32 (void *__addr, __mmask16 __mask,
+                             __m512i __index, __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index,
+                                (__v16si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i32scatter_epi64 (void *__addr, __m256i __index,
+                        __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF,
+                               (__v8si) __index, (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+                             __m256i __index, __m512i __v1, int __scale)
+{
+  __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index,
+                               (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_epi32 (void *__addr, __m512i __index,
+                        __m256i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF,
+                                (__v8di) __index, (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+                             __m512i __index, __m256i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index,
+                                (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_i64scatter_epi64 (void *__addr, __m512i __index,
+                        __m512i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF,
+                               (__v8di) __index, (__v8di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+                             __m512i __index, __m512i __v1, int __scale)
+{
+  __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index,
+                               (__v8di) __v1, __scale);
+}
+#else
+#define _mm512_i32gather_ps(INDEX, ADDR, SCALE)                                \
+  (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_undefined_ps(),\
+                                        (void const *) (ADDR),         \
+                                        (__v16si)(__m512i) (INDEX),    \
+                                        (__mmask16)0xFFFF,             \
+                                        (int) (SCALE))
+
+#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)      \
+  (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512) (V1OLD),    \
+                                        (void const *) (ADDR),         \
+                                        (__v16si)(__m512i) (INDEX),    \
+                                        (__mmask16) (MASK),            \
+                                        (int) (SCALE))
+
+#define _mm512_i32gather_pd(INDEX, ADDR, SCALE)                                \
+  (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_undefined_pd(),        \
+                                        (void const *) (ADDR),         \
+                                        (__v8si)(__m256i) (INDEX),     \
+                                        (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)      \
+  (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d) (V1OLD),    \
+                                        (void const *) (ADDR),         \
+                                        (__v8si)(__m256i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm512_i64gather_ps(INDEX, ADDR, SCALE)                                \
+  (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_undefined_ps(),        \
+                                        (void const *) (ADDR),         \
+                                        (__v8di)(__m512i) (INDEX),     \
+                                        (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)      \
+  (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256) (V1OLD),     \
+                                        (void const *) (ADDR),         \
+                                        (__v8di)(__m512i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm512_i64gather_pd(INDEX, ADDR, SCALE)                                \
+  (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_undefined_pd(),        \
+                                        (void const *) (ADDR),         \
+                                        (__v8di)(__m512i) (INDEX),     \
+                                        (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)      \
+  (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d) (V1OLD),    \
+                                        (void const *) (ADDR),         \
+                                        (__v8di)(__m512i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE)                     \
+  (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_epi32 (),\
+                                         (void const *) (ADDR),        \
+                                         (__v16si)(__m512i) (INDEX),   \
+                                         (__mmask16)0xFFFF,            \
+                                         (int) (SCALE))
+
+#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)   \
+  (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i) (V1OLD),  \
+                                         (void const *) (ADDR),        \
+                                         (__v16si)(__m512i) (INDEX),   \
+                                         (__mmask16) (MASK),           \
+                                         (int) (SCALE))
+
+#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE)                     \
+  (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_epi32 (),\
+                                        (void const *) (ADDR),         \
+                                        (__v8si)(__m256i) (INDEX),     \
+                                        (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)   \
+  (__m512i) __builtin_ia32_gathersiv8di ((__v8di)(__m512i) (V1OLD),    \
+                                        (void const *) (ADDR),         \
+                                        (__v8si)(__m256i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE)                        \
+  (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_undefined_si256(),\
+                                         (void const *) (ADDR),           \
+                                         (__v8di)(__m512i) (INDEX),       \
+                                         (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)   \
+  (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)(__m256i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v8di)(__m512i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE)                     \
+  (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_epi32 (),\
+                                        (void const *) (ADDR),         \
+                                        (__v8di)(__m512i) (INDEX),     \
+                                        (__mmask8)0xFF, (int) (SCALE))
+
+#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)   \
+  (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i) (V1OLD),    \
+                                        (void const *) (ADDR),         \
+                                        (__v8di)(__m512i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16)0xFFFF,   \
+                                (__v16si)(__m512i) (INDEX),            \
+                                (__v16sf)(__m512) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16) (MASK),  \
+                                (__v16si)(__m512i) (INDEX),            \
+                                (__v16sf)(__m512) (V1), (int) (SCALE))
+
+#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask8)0xFF,      \
+                                (__v8di)(__m512i) (INDEX),             \
+                                (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask16) (MASK),  \
+                                (__v8di)(__m512i) (INDEX),             \
+                                (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v8di)(__m512i) (INDEX),              \
+                               (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v8di)(__m512i) (INDEX),              \
+                               (__v8df)(__m512d) (V1), (int) (SCALE))
+
+#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16)0xFFFF,   \
+                                (__v16si)(__m512i) (INDEX),            \
+                                (__v16si)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16) (MASK),  \
+                                (__v16si)(__m512i) (INDEX),            \
+                                (__v16si)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8di)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8di)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8)0xFF,      \
+                                (__v8di)(__m512i) (INDEX),             \
+                                (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8) (MASK),   \
+                                (__v8di)(__m512i) (INDEX),             \
+                                (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v8di)(__m512i) (INDEX),              \
+                               (__v8di)(__m512i) (V1), (int) (SCALE))
+
+#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v8di)(__m512i) (INDEX),              \
+                               (__v8di)(__m512i) (V1), (int) (SCALE))
+#endif
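
A minimal usage sketch of the gather/scatter intrinsics above (illustrative only, not part of the committed header), assuming GCC with -mavx512f and a caller-supplied table of at least 16 ints; the scale argument must be a compile-time constant of 1, 2, 4 or 8:

    /* Illustrative sketch, not from the upstream header.  */
    #include <immintrin.h>

    void scatter_then_gather (int *table)
    {
      __m512i idx  = _mm512_set_epi32 (15, 14, 13, 12, 11, 10, 9, 8,
                                       7, 6, 5, 4, 3, 2, 1, 0);
      __m512i vals = _mm512_set1_epi32 (42);

      /* table[idx[i]] = vals[i] for every lane; element stride is 4 bytes.  */
      _mm512_i32scatter_epi32 (table, idx, vals, 4);

      /* Gather the low eight lanes back, keeping zeros in masked-off lanes.  */
      __mmask16 k  = 0x00FF;
      __m512i back = _mm512_mask_i32gather_epi32 (_mm512_setzero_si512 (),
                                                  k, idx, table, 4);
      (void) back;
    }
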
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                                                     (__v8df) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A,
+                                                     (__v8df)
+                                                     _mm512_setzero_pd (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
+{
+  __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                                                    (__v16sf) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A,
+                                                    (__v16sf)
+                                                    _mm512_setzero_ps (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
+{
+  __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
+                                         (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                                                     (__v8di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A,
+                                                     (__v8di)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                                                     (__v16si) __W,
+                                                     (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
+{
+  __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
+                                         (__mmask16) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P,
+                                                       (__v8df) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P,
+                                                        (__v8df)
+                                                        _mm512_setzero_pd (),
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P,
+                                                      (__v16sf) __W,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P)
+{
+  return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P,
+                                                       (__v16sf)
+                                                       _mm512_setzero_ps (),
+                                                       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A,
+                                                    (__v8di)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P,
+                                                       (__v8di) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m512i)
+        __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P,
+                                              (__v8di)
+                                              _mm512_setzero_si512 (),
+                                              (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P,
+                                                       (__v16si) __W,
+                                                       (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P)
+{
+  return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P,
+                                                        (__v16si)
+                                                        _mm512_setzero_si512
+                                                        (), (__mmask16) __U);
+}
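
A short sketch of the compress-store path defined above (illustrative only, not part of the committed header), assuming -mavx512f and a destination buffer with room for 16 floats:

    /* Illustrative sketch, not from the upstream header: pack the strictly
       positive lanes of one 16-float vector to the front of dst.  */
    #include <immintrin.h>

    int keep_positive (const float *src, float *dst)
    {
      __m512 v       = _mm512_loadu_ps (src);
      __mmask16 keep = _mm512_cmp_ps_mask (v, _mm512_setzero_ps (), _CMP_GT_OQ);

      /* Selected lanes are written contiguously starting at dst.  */
      _mm512_mask_compressstoreu_ps (dst, keep, v);

      /* The number of elements written equals the mask's population count.  */
      return __builtin_popcount ((unsigned int) keep);
    }
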
+
+/* Mask arithmetic operations */
+#define _kand_mask16 _mm512_kand
+#define _kandn_mask16 _mm512_kandn
+#define _knot_mask16 _mm512_knot
+#define _kor_mask16 _mm512_kor
+#define _kxnor_mask16 _mm512_kxnor
+#define _kxor_mask16 _mm512_kxor
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortest_mask16_u8  (__mmask16 __A,  __mmask16 __B, unsigned char *__CF)
+{
+  *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B);
+  return (unsigned char) __builtin_ia32_kortestzhi (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A,
+                                                   (__mmask16) __B);
+}
+
+extern __inline unsigned char
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B)
+{
+  return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A,
+                                                   (__mmask16) __B);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtmask16_u32 (__mmask16 __A)
+{
+  return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cvtu32_mask16 (unsigned int __A)
+{
+  return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_load_mask16 (__mmask16 *__A)
+{
+  return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_store_mask16 (__mmask16 *__A, __mmask16 __B)
+{
+  *(__mmask16 *) __A = __builtin_ia32_kmovw (__B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kand (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kandn (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A,
+                                            (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kortestz (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A,
+                                               (__mmask16) __B);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kortestc (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A,
+                                               (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kxnor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kxor (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_knot (__mmask16 __A)
+{
+  return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kunpackb (__mmask16 __A, __mmask16 __B)
+{
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_kunpackb_mask16 (__mmask8 __A, __mmask8 __B)
+{
+  return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B);
+}
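
A small sketch of the k-register operations above (illustrative only, not part of the committed header), assuming -mavx512f:

    /* Illustrative sketch, not from the upstream header: combine two
       comparison masks into a single "in range" mask.  */
    #include <immintrin.h>

    __mmask16 lanes_in_range (__m512 v, __m512 lo, __m512 hi)
    {
      __mmask16 ge_lo = _mm512_cmp_ps_mask (v, lo, _CMP_GE_OQ);
      __mmask16 le_hi = _mm512_cmp_ps_mask (v, hi, _CMP_LE_OQ);
      /* _kand_mask16 is the alias for _mm512_kand defined above.  */
      return _kand_mask16 (ge_lo, le_hi);
    }
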
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D,
+                         const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
+                                                   (__v4si) __D,
+                                                   __imm,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D,
+                         const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
+                                                  (__v4sf) __D,
+                                                  __imm,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (), __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C,
+                        __m128i __D, const int __imm)
+{
+  return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C,
+                                                   (__v4si) __D,
+                                                   __imm,
+                                                   (__v16si) __A,
+                                                   __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C,
+                        __m128 __D, const int __imm)
+{
+  return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C,
+                                                  (__v4sf) __D,
+                                                  __imm,
+                                                  (__v16sf) __A, __B);
+}
+#else
+#define _mm512_maskz_insertf32x4(A, X, Y, C)                            \
+  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
+    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(),      \
+    (__mmask16)(A)))
+
+#define _mm512_maskz_inserti32x4(A, X, Y, C)                            \
+  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
+    (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (),     \
+    (__mmask16)(A)))
+
+#define _mm512_mask_insertf32x4(A, B, X, Y, C)                          \
+  ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X),     \
+    (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A),             \
+                                            (__mmask16)(B)))
+
+#define _mm512_mask_inserti32x4(A, B, X, Y, C)                          \
+  ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X),   \
+    (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A),           \
+                                             (__mmask16)(B)))
+#endif
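
A sketch of the masked 128-bit-lane insert above (illustrative only, not part of the committed header), assuming -mavx512f; the lane index must be a compile-time constant in 0..3:

    /* Illustrative sketch, not from the upstream header: replace 128-bit
       lane 2 of a 512-bit vector, merging under a write mask.  */
    #include <immintrin.h>

    __m512 patch_third_lane (__m512 dst, __mmask16 k, __m128 x)
    {
      /* Lanes with a clear mask bit keep their value from dst.  */
      return _mm512_mask_insertf32x4 (dst, k, dst, x, 2);
    }
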
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu64 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A,
+                                                 (__v8di) __B,
+                                                 (__v8di)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epi32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_epu32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W, __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_epu32 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_undefined_epi32 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si)
+                                                 _mm512_setzero_si512 (),
+                                                 __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A,
+                                                 (__v16si) __B,
+                                                 (__v16si) __W, __M);
+}
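
A one-liner built on the integer min/max intrinsics above (illustrative only, not part of the committed header), assuming -mavx512f:

    /* Illustrative sketch, not from the upstream header: clamp every signed
       32-bit lane of v into [lo, hi].  */
    #include <immintrin.h>

    __m512i clamp_epi32 (__m512i v, __m512i lo, __m512i hi)
    {
      return _mm512_min_epi32 (_mm512_max_epi32 (v, lo), hi);
    }
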
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_unpacklo_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A,
+                                              (__v2df) __B,
+                                              __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R)
+{
+  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_sd (__m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A,
+                                              (__v2df) __B,
+                                              __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                          const int __R)
+{
+  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_ss (__m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_minss_round ((__v4sf) __A,
+                                             (__v4sf) __B,
+                                             __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                          const int __R)
+{
+  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_max_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_maxsd_round(A, B, C)
+
+#define _mm_mask_max_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_max_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_max_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_maxss_round(A, B, C)
+
+#define _mm_mask_max_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_max_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#define _mm_min_round_sd(A, B, C)            \
+    (__m128d)__builtin_ia32_minsd_round(A, B, C)
+
+#define _mm_mask_min_round_sd(W, U, A, B, C) \
+    (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_min_round_sd(U, A, B, C)   \
+    (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C)
+
+#define _mm_min_round_ss(A, B, C)            \
+    (__m128)__builtin_ia32_minss_round(A, B, C)
+
+#define _mm_mask_min_round_ss(W, U, A, B, C) \
+    (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C)
+
+#define _mm_maskz_min_round_ss(U, A, B, C)   \
+    (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C)
+
+#endif
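
A sketch of the rounding/SAE variants above (illustrative only, not part of the committed header), assuming -mavx512f; the last argument must be a compile-time constant:

    /* Illustrative sketch, not from the upstream header: scalar max of the
       low double lanes with floating-point exceptions suppressed (SAE).
       The upper lane of the result is copied from a.  */
    #include <immintrin.h>

    __m128d max_low_noexc (__m128d a, __m128d b)
    {
      return _mm_max_round_sd (a, b, _MM_FROUND_NO_EXC);
    }
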
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W)
+{
+  return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A,
+                                                    (__v8df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W)
+{
+  return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A,
+                                                   (__v8di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W)
+{
+  return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  (__v2df) __B,
+                                                  __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  -(__v2df) __B,
+                                                  __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 -(__v4sf) __B,
+                                                 __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  (__v2df) __B,
+                                                  __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  -(__v2df) __B,
+                                                  __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 -(__v4sf) __B,
+                                                 __R);
+}
+#else
+#define _mm_fmadd_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R)
+
+#define _mm_fmadd_round_ss(A, B, C, R)            \
+    (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R)
+
+#define _mm_fmsub_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R)
+
+#define _mm_fmsub_round_ss(A, B, C, R)            \
+    (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R)
+
+#define _mm_fnmadd_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R)
+
+#define _mm_fnmadd_round_ss(A, B, C, R)            \
+   (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R)
+
+#define _mm_fnmsub_round_sd(A, B, C, R)            \
+    (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R)
+
+#define _mm_fnmsub_round_ss(A, B, C, R)            \
+    (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R)
+#endif
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 (__v2df) __A,
+                                                 (__v2df) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                (__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 (__v2df) __A,
+                                                 -(__v2df) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                (__v4sf) __A,
+                                                -(__v4sf) __B,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  -(__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 -(__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 -(__v2df) __A,
+                                                 (__v2df) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                -(__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 -(__v2df) __A,
+                                                 -(__v2df) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                -(__v4sf) __A,
+                                                -(__v4sf) __B,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  -(__v2df) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 -(__v4sf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
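/* Editorial note, not part of the commit: a minimal usage sketch for the
   masked scalar FNMADD/FNMSUB forms above.  With the _mask form, element 0
   becomes -(__W*__A)+__B (or -(__W*__A)-__B for fnmsub) only when bit 0 of
   __U is set, otherwise __W's element 0 is kept; the _maskz form writes 0.0
   instead.  Variable names are illustrative; compile with -mavx512f.

     __m128 w = _mm_set_ss (2.0f), a = _mm_set_ss (3.0f), b = _mm_set_ss (10.0f);
     __m128 r1 = _mm_mask_fnmadd_ss (w, 0x1, a, b);   // element 0: -(2*3)+10 = 4.0f
     __m128 r2 = _mm_maskz_fnmadd_ss (0x0, w, a, b);  // element 0: 0.0f (mask bit clear)
*/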
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 (__v2df) __A,
+                                                 (__v2df) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                (__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 (__v2df) __A,
+                                                 -(__v2df) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                (__v4sf) __A,
+                                                -(__v4sf) __B,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  (__v2df) __A,
+                                                  -(__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 (__v4sf) __A,
+                                                 -(__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 -(__v2df) __A,
+                                                 (__v2df) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                -(__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                        const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W,
+                                                 -(__v2df) __A,
+                                                 -(__v2df) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                        const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W,
+                                                -(__v4sf) __A,
+                                                -(__v4sf) __B,
+                                                (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B,
+                         const int __R)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W,
+                                                  -(__v2df) __A,
+                                                  -(__v2df) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B,
+                         const int __R)
+{
+  return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W,
+                                                 -(__v4sf) __A,
+                                                 -(__v4sf) __B,
+                                                 (__mmask8) __U, __R);
+}
+#else
+#define _mm_mask_fmadd_round_sd(A, U, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, C, U, R)
+
+#define _mm_mask_fmadd_round_ss(A, U, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_mask (A, B, C, U, R)
+
+#define _mm_mask3_fmadd_round_sd(A, B, C, U, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, B, C, U, R)
+
+#define _mm_mask3_fmadd_round_ss(A, B, C, U, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_mask3 (A, B, C, U, R)
+
+#define _mm_maskz_fmadd_round_sd(U, A, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, C, U, R)
+
+#define _mm_maskz_fmadd_round_ss(U, A, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, C, U, R)
+
+#define _mm_mask_fmsub_round_sd(A, U, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, -(C), U, R)
+
+#define _mm_mask_fmsub_round_ss(A, U, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_mask (A, B, -(C), U, R)
+
+#define _mm_mask3_fmsub_round_sd(A, B, C, U, R)            \
+    (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, B, C, U, R)
+
+#define _mm_mask3_fmsub_round_ss(A, B, C, U, R)            \
+    (__m128) __builtin_ia32_vfmsubss3_mask3 (A, B, C, U, R)
+
+#define _mm_maskz_fmsub_round_sd(U, A, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, -(C), U, R)
+
+#define _mm_maskz_fmsub_round_ss(U, A, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, -(C), U, R)
+
+#define _mm_mask_fnmadd_round_sd(A, U, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), C, U, R)
+
+#define _mm_mask_fnmadd_round_ss(A, U, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), C, U, R)
+
+#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, -(B), C, U, R)
+
+#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_mask3 (A, -(B), C, U, R)
+
+#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), C, U, R)
+
+#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), C, U, R)
+
+#define _mm_mask_fnmsub_round_sd(A, U, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), -(C), U, R)
+
+#define _mm_mask_fnmsub_round_ss(A, U, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), -(C), U, R)
+
+#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R)            \
+    (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, -(B), C, U, R)
+
+#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R)            \
+    (__m128) __builtin_ia32_vfmsubss3_mask3 (A, -(B), C, U, R)
+
+#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R)            \
+    (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), -(C), U, R)
+
+#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R)            \
+    (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), -(C), U, R)
+#endif
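/* Editorial note, not part of the commit: the *_round_* variants above take
   a compile-time rounding-control immediate.  They are defined as inline
   functions only under __OPTIMIZE__, so the constant reaches the builtin;
   without optimization the macro fallbacks keep it a literal.  A sketch of
   the expected argument (assumed, per the usual GCC requirement that an
   explicit rounding mode is OR-ed with _MM_FROUND_NO_EXC):

     __m128d r = _mm_mask_fmadd_round_sd (w, 0x1, a, b,
                     _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
*/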
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R)
+{
+  return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R)
+{
+  return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R);
+}
+#else
+#define _mm_comi_round_ss(A, B, C, D)\
+__builtin_ia32_vcomiss(A, B, C, D)
+#define _mm_comi_round_sd(A, B, C, D)\
+__builtin_ia32_vcomisd(A, B, C, D)
+#endif
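/* Editorial note, not part of the commit: _mm_comi_round_ss/_sd compare the
   low elements with one of the _CMP_* predicates and return 0 or 1; the last
   argument selects exception behaviour (_MM_FROUND_CUR_DIRECTION or
   _MM_FROUND_NO_EXC).  Illustrative use:

     int lt = _mm_comi_round_ss (x, y, _CMP_LT_OS, _MM_FROUND_NO_EXC);
*/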
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                                                 (__v8df)
+                                                 _mm512_undefined_pd (),
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                                                 (__v8df) __W,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A,
+                                                 (__v8df)
+                                                 _mm512_setzero_pd (),
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+                                                (__v16sf)
+                                                _mm512_undefined_ps (),
+                                                (__mmask16) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+                                                (__v16sf) __W,
+                                                (__mmask16) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A,
+                                                (__v16sf)
+                                                _mm512_setzero_ps (),
+                                                (__mmask16) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
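/* Editorial note, not part of the commit: the three sqrt entry points follow
   the usual merge/zero masking pattern.  Sketch:

     __m512d r  = _mm512_sqrt_pd (a);                 // all 8 lanes
     __m512d rm = _mm512_mask_sqrt_pd (src, k, a);    // src lane where k bit is 0
     __m512d rz = _mm512_maskz_sqrt_pd (k, a);        // 0.0 where k bit is 0
*/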
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) ((__v8df)__A + (__v8df)__B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) ((__v16sf)__A + (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df)
+                                               _mm_setzero_pd (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
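/* Editorial note, not part of the commit: the unmasked packed add compiles to
   a plain vector '+', while the _mask/_maskz forms blend with __W or zero the
   lanes whose mask bit is clear; the sub, mul and div groups below repeat the
   same pattern.  Sketch:

     __m512d s  = _mm512_add_pd (a, b);
     __m512d sm = _mm512_mask_add_pd (w, k, a, b);    // w lane where k bit is 0
     __m512d sz = _mm512_maskz_add_pd (k, a, b);      // 0.0 where k bit is 0
*/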
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) ((__v8df)__A - (__v8df)__B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) ((__v16sf)__A - (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df)
+                                               _mm_setzero_pd (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) ((__v8df)__A * (__v8df)__B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) ((__v16sf)__A * (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B)
+{
+  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B)
+{
+  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_pd (__m512d __M, __m512d __V)
+{
+  return (__m512d) ((__v8df)__M / (__v8df)__V);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+                                                (__v8df) __V,
+                                                (__v8df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V)
+{
+  return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M,
+                                                (__v8df) __V,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) ((__v16sf)__A / (__v16sf)__B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A,
+                         __m128d __B)
+{
+  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A,
+                         __m128 __B)
+{
+  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_undefined_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A,
+                                                (__v8df) __B,
+                                                (__v8df)
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_undefined_ps (),
+                                               (__mmask16) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf) __W,
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A,
+                                               (__v16sf) __B,
+                                               (__v16sf)
+                                               _mm512_setzero_ps (),
+                                               (__mmask16) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
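/* Editorial note, not part of the commit: unlike add/sub/mul/div, min and max
   always go through the masked builtins (there is no plain operator form) and
   are believed to follow the legacy SSE MIN/MAX convention of returning the
   second source operand when an input is NaN.  Sketch:

     __m512d hi = _mm512_max_pd (a, b);
     __m128  lo = _mm_mask_min_ss (w, 0x1, x, y);     // element 0: min of low lanes of x, y
*/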
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_pd (__m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
+{
+  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_ps (__m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
+{
+  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
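/* Editorial note, not part of the commit: VSCALEF multiplies each element of
   the first operand by 2 raised to floor() of the matching element of the
   second; the scalar forms operate on element 0 only.  Sketch:

     __m512d scaled = _mm512_scalef_pd (a, e);                  // a[i] * 2^floor(e[i])
     __m128d one    = _mm_scalef_sd (_mm_set_sd (0.5),
                                     _mm_set_sd (1.0));         // low lane: 0.5 * 2^1 = 1.0
*/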
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
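+
+/* Illustrative usage (editorial sketch, not part of the upstream GCC 13.1.0
+   header): the fmadd intrinsics above compute a*b + c per lane; the mask_
+   form keeps lanes of the first operand where the mask bit is clear.
+   Assumes an AVX-512F target (-mavx512f).
+
+     __m512d a = _mm512_set1_pd (1.5);
+     __m512d b = _mm512_set1_pd (2.0);
+     __m512d c = _mm512_set1_pd (0.25);
+     __m512d r = _mm512_fmadd_pd (a, b, c);            // every lane: 3.25
+     __m512d m = _mm512_mask_fmadd_pd (a, 0x0F, b, c); // low 4 lanes 3.25, rest keep 1.5
+*/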
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A,
+                                                   (__v8df) __B,
+                                                   (__v8df) __C,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __B,
+                                                  (__v16sf) __C,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) -1,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16sf) __C,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      -(__v8df) __C,
+                                                      (__mmask8) -1,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      -(__v8df) __C,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       (__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A,
+                                                       (__v8df) __B,
+                                                       -(__v8df) __C,
+                                                       (__mmask8) __U,
+                                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     -(__v16sf) __C,
+                                                     (__mmask16) -1,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     -(__v16sf) __C,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      (__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A,
+                                                      (__v16sf) __B,
+                                                      -(__v16sf) __C,
+                                                      (__mmask16) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
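+
+/* Illustrative usage (editorial sketch, not in the upstream header): the
+   fmaddsub forms subtract in even lanes and add in odd lanes (a*b -/+ c);
+   fmsubadd mirrors that, which is why several fmsubadd variants above are
+   expressed through the fmaddsub builtin with a negated third operand.
+
+     __m512d r = _mm512_fmaddsub_pd (a, b, c);  // lane 0: a*b - c, lane 1: a*b + c, ...
+*/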
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A,
+                                                    (__v8df) __B,
+                                                    (__v8df) __C,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
+{
+  return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8df) __C,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A,
+                                                   (__v16sf) __B,
+                                                   (__v16sf) __C,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
+{
+  return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16sf) __C,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
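+
+/* Illustrative usage (editorial sketch, not in the upstream header): the
+   fnmadd/fnmsub intrinsics above negate the product, giving -(a*b) + c and
+   -(a*b) - c respectively.
+
+     __m512 r = _mm512_fnmadd_ps (a, b, c);  // per lane: c - a*b
+*/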
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epi32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_undefined_si256 (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                                                     (__v8si)
+                                                     _mm256_undefined_si256 (),
+                                                     (__mmask8) -1,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                                                     (__v8si) __W,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
+                                                     (__v8si)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epi32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                                                   (__v8si)
+                                                   _mm256_undefined_si256 (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_epu32 (__m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_undefined_si256 (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask16) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttps_epu32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                                                     (__v16si)
+                                                     _mm512_undefined_epi32 (),
+                                                     (__mmask16) -1,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                                                     (__v16si) __W,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A,
+                                                     (__v16si)
+                                                     _mm512_setzero_si512 (),
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epi32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                                                   (__v16si)
+                                                   _mm512_undefined_epi32 (),
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                                                   (__v16si) __W,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A,
+                                                   (__v16si)
+                                                   _mm512_setzero_si512 (),
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_epu32 (__m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask16) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                                                    (__v16si) __W,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A)
+{
+  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
+                                                    (__v16si)
+                                                    _mm512_setzero_si512 (),
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
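+
+/* Illustrative usage (editorial sketch, not in the upstream header): the
+   cvt* conversions above round with the current direction (round-to-nearest
+   by default), while cvtt* truncates toward zero; the epu32 variants
+   produce unsigned results.
+
+     __m512  x = _mm512_set1_ps (2.7f);
+     __m512i r = _mm512_cvtps_epi32 (x);   // 3 in every lane
+     __m512i t = _mm512_cvttps_epi32 (x);  // 2 in every lane
+*/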
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsd_f64 (__m512d __A)
+{
+  return __A[0];
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtss_f32 (__m512 __A)
+{
+  return __A[0];
+}
+
+#ifdef __x86_64__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+#endif
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_ss (__m128 __A, unsigned __B)
+{
+  return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ps (__m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
+{
+  return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
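+
+/* Illustrative usage (editorial sketch, not in the upstream header):
+   cvtepi32_ps reads each lane as signed, cvtepu32_ps as unsigned.
+
+     __m512i v = _mm512_set1_epi32 (-1);
+     __m512  s = _mm512_cvtepi32_ps (v);  // -1.0f in every lane
+     __m512  u = _mm512_cvtepu32_ps (v);  // 0xFFFFFFFF read as unsigned, ~4.29e9f
+*/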
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8di) __C,
+                                                     __imm,
+                                                     (__mmask8) -1,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B,
+                        __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A,
+                                                     (__v8df) __B,
+                                                     (__v8di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B,
+                         __m512i __C, const int __imm)
+{
+  return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A,
+                                                      (__v8df) __B,
+                                                      (__v8di) __C,
+                                                      __imm,
+                                                      (__mmask8) __U,
+                                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16si) __C,
+                                                    __imm,
+                                                    (__mmask16) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B,
+                        __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A,
+                                                    (__v16sf) __B,
+                                                    (__v16si) __C,
+                                                    __imm,
+                                                    (__mmask16) __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B,
+                         __m512i __C, const int __imm)
+{
+  return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A,
+                                                     (__v16sf) __B,
+                                                     (__v16si) __C,
+                                                     __imm,
+                                                     (__mmask16) __U,
+                                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__v2di) __C, __imm,
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B,
+                     __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__v2di) __C, __imm,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                      __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2di) __C,
+                                                   __imm,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__v4si) __C, __imm,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B,
+                     __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__v4si) __C, __imm,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                      __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4si) __C, __imm,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+#else
+#define _mm512_fixupimm_pd(X, Y, Z, C)                                 \
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),   \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),            \
+      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C)                          \
+  ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X),    \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C)                         \
+  ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X),   \
+      (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C),             \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_fixupimm_ps(X, Y, Z, C)                                 \
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),    \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),             \
+    (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C)                          \
+  ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X),     \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C)                         \
+  ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X),    \
+    (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C),              \
+    (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_sd(X, Y, Z, C)                                    \
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),    \
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),            \
+      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_sd(X, U, Y, Z, C)                            \
+    ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X),    \
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),            \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C)                           \
+    ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X),   \
+      (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C),            \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_fixupimm_ss(X, Y, Z, C)                                    \
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),      \
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),             \
+      (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_fixupimm_ss(X, U, Y, Z, C)                            \
+    ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X),      \
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),             \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C)                           \
+    ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X),     \
+      (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C),             \
+      (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+#endif
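+
+/* Editorial note (not in the upstream header): the fixupimm builtins above
+   require a compile-time immediate, so inline functions are provided only
+   under __OPTIMIZE__ and equivalent macros are used otherwise, where the
+   constant is not guaranteed to propagate into an always_inline call.
+
+     __m512d r = _mm512_fixupimm_pd (a, b, table, 0);  // last argument must be a literal
+*/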
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf)
+                                                          __A,
+                                                          _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_u64 (__m128 __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf)
+                                                           __A,
+                                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_i64 (__m128 __A)
+{
+  return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+#endif /* __x86_64__ */
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsi512_si32 (__m512i __A)
+{
+  __v16si __B = (__v16si) __A;
+  return __B[0];
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_u32 (__m128 __A)
+{
+  return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_i32 (__m128 __A)
+{
+  return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_i32 (__m128d __A)
+{
+  return (int) __builtin_ia32_cvtsd2si ((__v2df) __A);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_i32 (__m128 __A)
+{
+  return (int) __builtin_ia32_cvtss2si ((__v4sf) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti32_sd (__m128d __A, int __B)
+{
+  return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti32_ss (__m128 __A, int __B)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df)
+                                                          __A,
+                                                          _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_u64 (__m128d __A)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df)
+                                                           __A,
+                                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_i64 (__m128d __A)
+{
+  return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_i64 (__m128d __A)
+{
+  return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_i64 (__m128 __A)
+{
+  return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti64_sd (__m128d __A, long long __B)
+{
+  return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti64_ss (__m128 __A, long long __B)
+{
+  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+#endif /* __x86_64__ */
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_u32 (__m128d __A)
+{
+  return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_i32 (__m128d __A)
+{
+  return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
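+
+/* Illustrative usage (editorial sketch, not in the upstream header): the
+   scalar conversions above operate on element 0 of the vector; cvt* rounds
+   with the current direction, cvtt* truncates toward zero.
+
+     __m128d  d = _mm_set_sd (3.9);
+     unsigned u = _mm_cvtsd_u32 (d);   // 4
+     unsigned t = _mm_cvttsd_u32 (d);  // 3
+*/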
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtps_pd (__m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
+{
+  return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_ps (__m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                                                   (__v16sf)
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                                                   (__v16sf) __W,
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
+{
+  return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   (__mmask16) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_ps (__m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                                                  (__v8sf)
+                                                  _mm256_undefined_ps (),
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
+{
+  return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_ps (__m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_undefined_ps (),
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+                                                  (__v16sf) __W,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
+{
+  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A,
+                                                  (__v16sf)
+                                                  _mm512_setzero_ps (),
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_pd (__m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                                                   (__v8df)
+                                                   _mm512_undefined_pd (),
+                                                   (__mmask8) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                                                   (__v8df) __W,
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
+{
+  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A,
+                                                   (__v8df)
+                                                   _mm512_setzero_pd (),
+                                                   (__mmask8) __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df) __W,
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df)
+                                               _mm_setzero_pd (),
+                                               (__mmask8) __U,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+                                                    (__C << 2) | __B,
+                                                    _mm512_undefined_pd (),
+                                                    (__mmask8) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A,
+                       _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v8df) __W, __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_pd (__mmask8 __U, __m512d __A,
+                        _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v8df)
+                                                    _mm512_setzero_pd (),
+                                                    __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+                                                   (__C << 2) | __B,
+                                                   _mm512_undefined_ps (),
+                                                   (__mmask16) -1,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A,
+                       _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v16sf) __W, __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A,
+                        _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v16sf)
+                                                   _mm512_setzero_ps (),
+                                                   __U,
+                                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C,
+               _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A,
+                                                  (__v2df) __B,
+                                                  (__D << 2) | __C,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B,
+                       _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__D << 2) | __C,
+                                                        (__v2df) __W,
+                                                      __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B,
+                        _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A,
+                                                        (__v2df) __B,
+                                                       (__D << 2) | __C,
+                                                        (__v2df)
+                                                       _mm_setzero_pd(),
+                                                       __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C,
+               _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A,
+                                                 (__v4sf) __B,
+                                                 (__D << 2) | __C,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                       _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__D << 2) | __C,
+                                                        (__v4sf) __W,
+                                                      __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B,
+                        _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A,
+                                                        (__v4sf) __B,
+                                                       (__D << 2) | __C,
+                                                        (__v4sf)
+                                                       _mm_setzero_ps(),
+                                                       __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_getmant_pd(X, B, C)                                                  \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)_mm512_undefined_pd(),        \
+                                              (__mmask8)-1,\
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_pd(W, U, X, B, C)                                       \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)(__m512d)(W),                 \
+                                              (__mmask8)(U),\
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_pd(U, X, B, C)                                         \
+  ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X),                 \
+                                              (int)(((C)<<2) | (B)),                \
+                                              (__v8df)_mm512_setzero_pd(),          \
+                                              (__mmask8)(U),\
+                                             _MM_FROUND_CUR_DIRECTION))
+#define _mm512_getmant_ps(X, B, C)                                                  \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)_mm512_undefined_ps(),        \
+                                             (__mmask16)-1,\
+                                            _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ps(W, U, X, B, C)                                       \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)(__m512)(W),                  \
+                                             (__mmask16)(U),\
+                                            _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getmant_ps(U, X, B, C)                                         \
+  ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X),                  \
+                                             (int)(((C)<<2) | (B)),                 \
+                                             (__v16sf)_mm512_setzero_ps(),          \
+                                             (__mmask16)(U),\
+                                            _MM_FROUND_CUR_DIRECTION))
+#define _mm_getmant_sd(X, Y, C, D)                                                  \
+  ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X),                    \
+                                           (__v2df)(__m128d)(Y),                    \
+                                           (int)(((D)<<2) | (C)),                   \
+                                          _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sd(W, U, X, Y, C, D)                                       \
+  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                 \
+                                                 (__v2df)(__m128d)(Y),                 \
+                                                 (int)(((D)<<2) | (C)),                \
+                                                (__v2df)(__m128d)(W),                 \
+                                              (__mmask8)(U),\
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_sd(U, X, Y, C, D)                                         \
+  ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X),                 \
+                                           (__v2df)(__m128d)(Y),                     \
+                                              (int)(((D)<<2) | (C)),                \
+                                           (__v2df)_mm_setzero_pd(),             \
+                                              (__mmask8)(U),\
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getmant_ss(X, Y, C, D)                                                  \
+  ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X),                      \
+                                          (__v4sf)(__m128)(Y),                      \
+                                          (int)(((D)<<2) | (C)),                    \
+                                         _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_ss(W, U, X, Y, C, D)                                       \
+  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                 \
+                                                 (__v4sf)(__m128)(Y),                 \
+                                                 (int)(((D)<<2) | (C)),                \
+                                                (__v4sf)(__m128)(W),                 \
+                                              (__mmask8)(U),\
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_ss(U, X, Y, C, D)                                         \
+  ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X),                 \
+                                           (__v4sf)(__m128)(Y),                     \
+                                              (int)(((D)<<2) | (C)),                \
+                                           (__v4sf)_mm_setzero_ps(),             \
+                                              (__mmask8)(U),\
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getexp_ss(A, B)                                                  \
+  ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B),  \
+                                          _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getexp_ss(W, U, A, B) \
+    (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U,\
+                                             _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_getexp_ss(U, A, B)   \
+    (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U,\
+                                             _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_getexp_sd(A, B)                                                   \
+  ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B),\
+                                           _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getexp_sd(W, U, A, B) \
+    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U,\
+                                             _MM_FROUND_CUR_DIRECTION)
+
+#define _mm_maskz_getexp_sd(U, A, B)   \
+    (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U,\
+                                             _MM_FROUND_CUR_DIRECTION)
+
+#define _mm512_getexp_ps(A)                                            \
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),               \
+  (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getexp_ps(W, U, A)                                 \
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),               \
+  (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getexp_ps(U, A)                                   \
+  ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A),               \
+  (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getexp_pd(A)                                            \
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),              \
+  (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getexp_pd(W, U, A)                                 \
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),              \
+  (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_getexp_pd(U, A)                                   \
+  ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A),              \
+  (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_ps (__m512 __A, const int __imm)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm,
+                                                 (__v16sf)
+                                                 _mm512_undefined_ps (),
+                                                 -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C,
+                          const int __imm)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm,
+                                                 (__v16sf) __A,
+                                                 (__mmask16) __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm)
+{
+  return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B,
+                                                 __imm,
+                                                 (__v16sf)
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_pd (__m512d __A, const int __imm)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm,
+                                                  (__v8df)
+                                                  _mm512_undefined_pd (),
+                                                  -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C,
+                          const int __imm)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm,
+                                                  (__v8df) __A,
+                                                  (__mmask8) __B,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm)
+{
+  return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B,
+                                                  __imm,
+                                                  (__v8df)
+                                                  _mm512_setzero_pd (),
+                                                  (__mmask8) __A,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm)
+{
+  return (__m128)
+    __builtin_ia32_rndscaless_mask_round ((__v4sf) __A,
+                                         (__v4sf) __B, __imm,
+                                         (__v4sf)
+                                         _mm_setzero_ps (),
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128 __D,
+                       const int __imm)
+{
+  return (__m128)
+    __builtin_ia32_rndscaless_mask_round ((__v4sf) __C,
+                                         (__v4sf) __D, __imm,
+                                         (__v4sf) __A,
+                                         (__mmask8) __B,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_ss (__mmask8 __A, __m128 __B, __m128 __C,
+                        const int __imm)
+{
+  return (__m128)
+    __builtin_ia32_rndscaless_mask_round ((__v4sf) __B,
+                                         (__v4sf) __C, __imm,
+                                         (__v4sf)
+                                         _mm_setzero_ps (),
+                                         (__mmask8) __A,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm)
+{
+  return (__m128d)
+    __builtin_ia32_rndscalesd_mask_round ((__v2df) __A,
+                                         (__v2df) __B, __imm,
+                                         (__v2df)
+                                         _mm_setzero_pd (),
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128d __D,
+                       const int __imm)
+{
+  return (__m128d)
+    __builtin_ia32_rndscalesd_mask_round ((__v2df) __C,
+                                         (__v2df) __D, __imm,
+                                         (__v2df) __A,
+                                         (__mmask8) __B,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_sd (__mmask8 __A, __m128d __B, __m128d __C,
+                        const int __imm)
+{
+  return (__m128d)
+    __builtin_ia32_rndscalesd_mask_round ((__v2df) __B,
+                                         (__v2df) __C, __imm,
+                                         (__v2df)
+                                         _mm_setzero_pd (),
+                                         (__mmask8) __A,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_roundscale_ps(A, B) \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\
+    (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_mask_roundscale_ps(A, B, C, D)                          \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C),      \
+                                           (int)(D),                   \
+                                           (__v16sf)(__m512)(A),       \
+                                           (__mmask16)(B), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_maskz_roundscale_ps(A, B, C)                            \
+  ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B),      \
+                                           (int)(C),                   \
+                                           (__v16sf)_mm512_setzero_ps(),\
+                                           (__mmask16)(A), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_roundscale_pd(A, B) \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\
+    (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_mask_roundscale_pd(A, B, C, D)                          \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C),     \
+                                            (int)(D),                  \
+                                            (__v8df)(__m512d)(A),      \
+                                            (__mmask8)(B), _MM_FROUND_CUR_DIRECTION))
+#define _mm512_maskz_roundscale_pd(A, B, C)                            \
+  ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B),     \
+                                            (int)(C),                  \
+                                            (__v8df)_mm512_setzero_pd(),\
+                                            (__mmask8)(A), _MM_FROUND_CUR_DIRECTION))
+#define _mm_roundscale_ss(A, B, I)                                     \
+  ((__m128)                                                            \
+   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A),                \
+                                        (__v4sf) (__m128) (B),         \
+                                        (int) (I),                     \
+                                        (__v4sf) _mm_setzero_ps (),    \
+                                        (__mmask8) (-1),               \
+                                        _MM_FROUND_CUR_DIRECTION))
+#define _mm_mask_roundscale_ss(A, U, B, C, I)                          \
+  ((__m128)                                                            \
+   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B),                \
+                                        (__v4sf) (__m128) (C),         \
+                                        (int) (I),                     \
+                                        (__v4sf) (__m128) (A),         \
+                                        (__mmask8) (U),                \
+                                        _MM_FROUND_CUR_DIRECTION))
+#define _mm_maskz_roundscale_ss(U, A, B, I)                            \
+  ((__m128)                                                            \
+   __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A),                \
+                                        (__v4sf) (__m128) (B),         \
+                                        (int) (I),                     \
+                                        (__v4sf) _mm_setzero_ps (),    \
+                                        (__mmask8) (U),                \
+                                        _MM_FROUND_CUR_DIRECTION))
+#define _mm_roundscale_sd(A, B, I)                                     \
+  ((__m128d)                                                           \
+   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A),       \
+                                        (__v2df) (__m128d) (B),        \
+                                        (int) (I),                     \
+                                        (__v2df) _mm_setzero_pd (),    \
+                                        (__mmask8) (-1),               \
+                                        _MM_FROUND_CUR_DIRECTION))
+#define _mm_mask_roundscale_sd(A, U, B, C, I)                          \
+  ((__m128d)                                                           \
+   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B),       \
+                                        (__v2df) (__m128d) (C),        \
+                                        (int) (I),                     \
+                                        (__v2df) (__m128d) (A),        \
+                                        (__mmask8) (U),                \
+                                        _MM_FROUND_CUR_DIRECTION))
+#define _mm_maskz_roundscale_sd(U, A, B, I)                            \
+  ((__m128d)                                                           \
+   __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A),       \
+                                        (__v2df) (__m128d) (B),        \
+                                        (int) (I),                     \
+                                        (__v2df) _mm_setzero_pd (),    \
+                                        (__mmask8) (U),                \
+                                        _MM_FROUND_CUR_DIRECTION))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, __P,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, __P,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, __P,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, __P,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+                                              (__v2df) __Y, __P,
+                                              (__mmask8) -1,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X,
+                                              (__v2df) __Y, __P,
+                                              (__mmask8) __M,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+                                              (__v4sf) __Y, __P,
+                                              (__mmask8) -1,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X,
+                                              (__v4sf) __Y, __P,
+                                              (__mmask8) __M,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+#else
+#define _mm512_cmp_pd_mask(X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),      \
+                                           (__v8df)(__m512d)(Y), (int)(P),\
+                                           (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_cmp_ps_mask(X, Y, P)                                    \
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),     \
+                                            (__v16sf)(__m512)(Y), (int)(P),\
+                                            (__mmask16)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_cmp_pd_mask(M, X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X),      \
+                                           (__v8df)(__m512d)(Y), (int)(P),\
+                                           (__mmask8)(M), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_cmp_ps_mask(M, X, Y, P)                                    \
+  ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X),     \
+                                            (__v16sf)(__m512)(Y), (int)(P),\
+                                            (__mmask16)(M),_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_cmp_sd_mask(X, Y, P)                                       \
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),         \
+                                        (__v2df)(__m128d)(Y), (int)(P),\
+                                        (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_sd_mask(M, X, Y, P)                                       \
+  ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X),         \
+                                        (__v2df)(__m128d)(Y), (int)(P),\
+                                        M,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_cmp_ss_mask(X, Y, P)                                       \
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),          \
+                                        (__v4sf)(__m128)(Y), (int)(P), \
+                                        (__mmask8)-1,_MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_cmp_ss_mask(M, X, Y, P)                                       \
+  ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X),          \
+                                        (__v4sf)(__m128)(Y), (int)(P), \
+                                        M,_MM_FROUND_CUR_DIRECTION))
+#endif
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_EQ_OQ,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_EQ_OQ,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_LT_OS,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_LT_OS,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_LE_OS,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_LE_OS,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpunord_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_UNORD_Q,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpunord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_UNORD_Q,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_NEQ_UQ,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_NEQ_UQ,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnlt_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_NLT_US,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnlt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_NLT_US,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnle_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_NLE_US,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnle_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_NLE_US,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpord_pd_mask (__m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_ORD_Q,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y)
+{
+  return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X,
+                                                 (__v8df) __Y, _CMP_ORD_Q,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_EQ_OQ,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_EQ_OQ,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmplt_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_LT_OS,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmplt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_LT_OS,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmple_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_LE_OS,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmple_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_LE_OS,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpunord_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_UNORD_Q,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpunord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_UNORD_Q,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpneq_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_NEQ_UQ,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpneq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_NEQ_UQ,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnlt_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_NLT_US,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnlt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_NLT_US,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpnle_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_NLE_US,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpnle_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_NLE_US,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpord_ps_mask (__m512 __X, __m512 __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_ORD_Q,
+                                                  (__mmask16) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y)
+{
+   return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X,
+                                                  (__v16sf) __Y, _CMP_ORD_Q,
+                                                  (__mmask16) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_kmov (__mmask16 __A)
+{
+  return __builtin_ia32_kmovw (__A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_ps (__m512d __A)
+{
+  return (__m512) (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_si512 (__m512d __A)
+{
+  return (__m512i) (__A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_pd (__m512 __A)
+{
+  return (__m512d) (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_si512 (__m512 __A)
+{
+  return (__m512i) (__A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_ps (__m512i __A)
+{
+  return (__m512) (__A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_pd (__m512i __A)
+{
+  return (__m512d) (__A);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd512_pd128 (__m512d __A)
+{
+  return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps512_ps128 (__m512 __A)
+{
+  return _mm512_extractf32x4_ps(__A, 0);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_si128 (__m512i __A)
+{
+  return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd512_pd256 (__m512d __A)
+{
+  return _mm512_extractf64x4_pd(__A, 0);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps512_ps256 (__m512 __A)
+{
+  return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_si256 (__m512i __A)
+{
+  return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd128_pd512 (__m128d __A)
+{
+  return (__m512d) __builtin_ia32_pd512_pd((__m128d)__A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps128_ps512 (__m128 __A)
+{
+  return (__m512) __builtin_ia32_ps512_ps((__m128)__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi128_si512 (__m128i __A)
+{
+  return (__m512i) __builtin_ia32_si512_si((__v4si)__A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd256_pd512 (__m256d __A)
+{
+  return __builtin_ia32_pd512_256pd (__A);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps256_ps512 (__m256 __A)
+{
+  return __builtin_ia32_ps512_256ps (__A);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi256_si512 (__m256i __A)
+{
+  return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpd128_pd512 (__m128d __A)
+{
+  return (__m512d) _mm512_insertf32x4 (_mm512_setzero_ps (), (__m128) __A, 0);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextps128_ps512 (__m128 __A)
+{
+  return _mm512_insertf32x4 (_mm512_setzero_ps (), __A, 0);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextsi128_si512 (__m128i __A)
+{
+  return _mm512_inserti32x4 (_mm512_setzero_si512 (), __A, 0);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextpd256_pd512 (__m256d __A)
+{
+  return _mm512_insertf64x4 (_mm512_setzero_pd (), __A, 0);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextps256_ps512 (__m256 __A)
+{
+  return (__m512) _mm512_insertf64x4 (_mm512_setzero_pd (), (__m256d) __A, 0);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextsi256_si512 (__m256i __A)
+{
+  return _mm512_inserti64x4 (_mm512_setzero_si512 (), __A, 0);
+}
+
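Illustration only, not part of the patch: the cast widenings above leave the upper bits of the destination undefined, while the _mm512_zext* variants guarantee they are zero. A minimal sketch, assuming <immintrin.h> and -mavx512f:

    __m128 lo = _mm_set1_ps (1.0f);
    __m512 a = _mm512_castps128_ps512 (lo);   /* upper 12 floats undefined */
    __m512 b = _mm512_zextps128_ps512 (lo);   /* upper 12 floats are 0.0f  */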
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+                                                    (__v16si) __B, 0,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+                                                    (__v16si) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpeq_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+                                                   (__v8di) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpeq_epu64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+                                                   (__v8di) __B, 0,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu32_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+                                                    (__v16si) __B, 6,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A,
+                                                    (__v16si) __B, 6,  __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmpgt_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+                                                   (__v8di) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmpgt_epu64_mask (__m512i __A, __m512i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A,
+                                                   (__v8di) __B, 6,
+                                                   (__mmask8) -1);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1);           \
+  __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0);           \
+  __m256i __T3 = (__m256i) (__T1 op __T2);                             \
+  __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1);           \
+  __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0);           \
+  __v4si __T6 = __T4 op __T5;                                          \
+  __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });     \
+  __v4si __T8 = __T6 op __T7;                                          \
+  return __T8[0] op __T8[1]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (|);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1);         \
+  __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0);         \
+  __m256i __T3 = _mm256_##op (__T1, __T2);                             \
+  __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1);         \
+  __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0);         \
+  __m128i __T6 = _mm_##op (__T4, __T5);                                        \
+  __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6,           \
+                                             (__v4si) { 2, 3, 0, 1 }); \
+  __m128i __T8 = _mm_##op (__T6, __T7);                                        \
+  __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8,           \
+                                             (__v4si) { 1, 0, 1, 0 }); \
+  __v4si __T10 = (__v4si) _mm_##op (__T8, __T9);                       \
+  return __T10[0]
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu32 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epu32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A);
+  __MM512_REDUCE_OP (min_epi32);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A);
+  __MM512_REDUCE_OP (max_epi32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A);
+  __MM512_REDUCE_OP (min_epu32);
+}
+
+extern __inline unsigned int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi32 (__U, __A);
+  __MM512_REDUCE_OP (max_epu32);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);    \
+  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);    \
+  __m256 __T3 = __T1 op __T2;                                          \
+  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);                       \
+  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);                       \
+  __m128 __T6 = __T4 op __T5;                                          \
+  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });     \
+  __m128 __T8 = __T6 op __T7;                                          \
+  return __T8[0] op __T8[1]
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_maskz_mov_ps (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1);    \
+  __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0);    \
+  __m256 __T3 = _mm256_##op (__T1, __T2);                              \
+  __m128 __T4 = _mm256_extractf128_ps (__T3, 1);                       \
+  __m128 __T5 = _mm256_extractf128_ps (__T3, 0);                       \
+  __m128 __T6 = _mm_##op (__T4, __T5);                                 \
+  __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 });     \
+  __m128 __T8 = _mm_##op (__T6, __T7);                                 \
+  __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 });     \
+  __m128 __T10 = _mm_##op (__T8, __T9);                                        \
+  return __T10[0]
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_ps (__m512 __A)
+{
+  __MM512_REDUCE_OP (max_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A);
+  __MM512_REDUCE_OP (min_ps);
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A)
+{
+  __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A);
+  __MM512_REDUCE_OP (max_ps);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1);           \
+  __v4di __T2 = (__v4di) _mm512_extracti64x4_epi64 (__A, 0);           \
+  __m256i __T3 = (__m256i) (__T1 op __T2);                             \
+  __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1);           \
+  __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0);           \
+  __v2di __T6 = __T4 op __T5;                                          \
+  return __T6[0] op __T6[1]
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_and_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_or_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (|);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+  __MM512_REDUCE_OP (&);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (|);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e);                        \
+  __m512i __T2 = _mm512_##op (__A, __T1);                              \
+  __m512i __T3                                                         \
+    = (__m512i) __builtin_shuffle ((__v8di) __T2,                      \
+                                  (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 });\
+  __m512i __T4 = _mm512_##op (__T2, __T3);                             \
+  __m512i __T5                                                         \
+    = (__m512i) __builtin_shuffle ((__v8di) __T4,                      \
+                                  (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });\
+  __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5);                     \
+  return __T6[0]
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epi64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__),
+                              __U, __A);
+  __MM512_REDUCE_OP (min_epi64);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1),
+                              __U, __A);
+  __MM512_REDUCE_OP (max_epi64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_epu64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_epu64 (__m512i __A)
+{
+  __MM512_REDUCE_OP (max_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A);
+  __MM512_REDUCE_OP (min_epu64);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A)
+{
+  __A = _mm512_maskz_mov_epi64 (__U, __A);
+  __MM512_REDUCE_OP (max_epu64);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);            \
+  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);            \
+  __m256d __T3 = __T1 op __T2;                                         \
+  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);                      \
+  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);                      \
+  __m128d __T6 = __T4 op __T5;                                         \
+  return __T6[0] op __T6[1]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (*);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_maskz_mov_pd (__U, __A);
+  __MM512_REDUCE_OP (+);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A);
+  __MM512_REDUCE_OP (*);
+}
+
+#undef __MM512_REDUCE_OP
+#define __MM512_REDUCE_OP(op) \
+  __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1);            \
+  __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0);            \
+  __m256d __T3 = _mm256_##op (__T1, __T2);                             \
+  __m128d __T4 = _mm256_extractf128_pd (__T3, 1);                      \
+  __m128d __T5 = _mm256_extractf128_pd (__T3, 0);                      \
+  __m128d __T6 = _mm_##op (__T4, __T5);                                        \
+  __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 });        \
+  __m128d __T8 = _mm_##op (__T6, __T7);                                        \
+  return __T8[0]
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_pd (__m512d __A)
+{
+  __MM512_REDUCE_OP (max_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A);
+  __MM512_REDUCE_OP (min_pd);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A)
+{
+  __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A);
+  __MM512_REDUCE_OP (max_pd);
+}
+
+#undef __MM512_REDUCE_OP
+
+#ifdef __DISABLE_AVX512F__
+#undef __DISABLE_AVX512F__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512F__ */
+
+#endif /* _AVX512FINTRIN_H_INCLUDED */
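Illustration only, not part of the patch: a minimal usage sketch for the horizontal reductions defined above, assuming <immintrin.h> and compilation with -mavx512f.

    #include <immintrin.h>
    #include <stdio.h>

    int main (void)
    {
      __m512i vi = _mm512_set1_epi32 (3);    /* sixteen int lanes of 3      */
      __m512  vf = _mm512_set1_ps (1.5f);    /* sixteen float lanes of 1.5f */
      printf ("%d %f\n",
              _mm512_reduce_add_epi32 (vi),  /* 16 * 3   == 48   */
              _mm512_reduce_add_ps (vf));    /* 16 * 1.5 == 24.0 */
      return 0;
    }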
diff --git a/include-gcc/avx512fp16intrin.h b/include-gcc/avx512fp16intrin.h
new file mode 100644 (file)
index 0000000..dd083e5
--- /dev/null
+++ b/include-gcc/avx512fp16intrin.h
@@ -0,0 +1,7219 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16INTRIN_H_INCLUDED
+#define __AVX512FP16INTRIN_H_INCLUDED
+
+#ifndef __AVX512FP16__
+#pragma GCC push_options
+#pragma GCC target("avx512fp16")
+#define __DISABLE_AVX512FP16__
+#endif /* __AVX512FP16__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16)));
+typedef _Float16 __v16hf __attribute__ ((__vector_size__ (32)));
+typedef _Float16 __v32hf __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__));
+typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__));
+
+/* Unaligned version of the same type.  */
+typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16),       \
+                                          __may_alias__, __aligned__ (1)));
+typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32),       \
+                                          __may_alias__, __aligned__ (1)));
+typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64),       \
+                                          __may_alias__, __aligned__ (1)));
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5,
+           _Float16 __A4, _Float16 __A3, _Float16 __A2,
+           _Float16 __A1, _Float16 __A0)
+{
+  return __extension__ (__m128h)(__v8hf){ __A0, __A1, __A2, __A3,
+                                         __A4, __A5, __A6, __A7 };
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_ph (_Float16 __A15, _Float16 __A14, _Float16 __A13,
+              _Float16 __A12, _Float16 __A11, _Float16 __A10,
+              _Float16 __A9, _Float16 __A8, _Float16 __A7,
+              _Float16 __A6, _Float16 __A5, _Float16 __A4,
+              _Float16 __A3, _Float16 __A2, _Float16 __A1,
+              _Float16 __A0)
+{
+  return __extension__ (__m256h)(__v16hf){ __A0, __A1, __A2, __A3,
+                                          __A4, __A5, __A6, __A7,
+                                          __A8, __A9, __A10, __A11,
+                                          __A12, __A13, __A14, __A15 };
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set_ph (_Float16 __A31, _Float16 __A30, _Float16 __A29,
+              _Float16 __A28, _Float16 __A27, _Float16 __A26,
+              _Float16 __A25, _Float16 __A24, _Float16 __A23,
+              _Float16 __A22, _Float16 __A21, _Float16 __A20,
+              _Float16 __A19, _Float16 __A18, _Float16 __A17,
+              _Float16 __A16, _Float16 __A15, _Float16 __A14,
+              _Float16 __A13, _Float16 __A12, _Float16 __A11,
+              _Float16 __A10, _Float16 __A9, _Float16 __A8,
+              _Float16 __A7, _Float16 __A6, _Float16 __A5,
+              _Float16 __A4, _Float16 __A3, _Float16 __A2,
+              _Float16 __A1, _Float16 __A0)
+{
+  return __extension__ (__m512h)(__v32hf){ __A0, __A1, __A2, __A3,
+                                          __A4, __A5, __A6, __A7,
+                                          __A8, __A9, __A10, __A11,
+                                          __A12, __A13, __A14, __A15,
+                                          __A16, __A17, __A18, __A19,
+                                          __A20, __A21, __A22, __A23,
+                                          __A24, __A25, __A26, __A27,
+                                          __A28, __A29, __A30, __A31 };
+}
+
+/* Create vectors of elements in the reversed order from _mm_set_ph,
+   _mm256_set_ph and _mm512_set_ph functions.  */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+            _Float16 __A3, _Float16 __A4, _Float16 __A5,
+            _Float16 __A6, _Float16 __A7)
+{
+  return _mm_set_ph (__A7, __A6, __A5, __A4, __A3, __A2, __A1, __A0);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+               _Float16 __A3, _Float16 __A4, _Float16 __A5,
+               _Float16 __A6, _Float16 __A7, _Float16 __A8,
+               _Float16 __A9, _Float16 __A10, _Float16 __A11,
+               _Float16 __A12, _Float16 __A13, _Float16 __A14,
+               _Float16 __A15)
+{
+  return _mm256_set_ph (__A15, __A14, __A13, __A12, __A11, __A10, __A9,
+                       __A8, __A7, __A6, __A5, __A4, __A3, __A2, __A1,
+                       __A0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2,
+               _Float16 __A3, _Float16 __A4, _Float16 __A5,
+               _Float16 __A6, _Float16 __A7, _Float16 __A8,
+               _Float16 __A9, _Float16 __A10, _Float16 __A11,
+               _Float16 __A12, _Float16 __A13, _Float16 __A14,
+               _Float16 __A15, _Float16 __A16, _Float16 __A17,
+               _Float16 __A18, _Float16 __A19, _Float16 __A20,
+               _Float16 __A21, _Float16 __A22, _Float16 __A23,
+               _Float16 __A24, _Float16 __A25, _Float16 __A26,
+               _Float16 __A27, _Float16 __A28, _Float16 __A29,
+               _Float16 __A30, _Float16 __A31)
+
+{
+  return _mm512_set_ph (__A31, __A30, __A29, __A28, __A27, __A26, __A25,
+                       __A24, __A23, __A22, __A21, __A20, __A19, __A18,
+                       __A17, __A16, __A15, __A14, __A13, __A12, __A11,
+                       __A10, __A9, __A8, __A7, __A6, __A5, __A4, __A3,
+                       __A2, __A1, __A0);
+}
+
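Illustration only, not part of the patch: _mm_set_ph lists lanes from the highest element down to element 0, while _mm_setr_ph lists them in memory order, so the two calls in this sketch (assuming <immintrin.h> and -mavx512fp16) build the same vector.

    __m128h a = _mm_set_ph  (7.0f16, 6.0f16, 5.0f16, 4.0f16,
                             3.0f16, 2.0f16, 1.0f16, 0.0f16);
    __m128h b = _mm_setr_ph (0.0f16, 1.0f16, 2.0f16, 3.0f16,
                             4.0f16, 5.0f16, 6.0f16, 7.0f16);
    /* In both vectors element i holds (_Float16) i, so a[0] == b[0] == 0.0f16. */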
+/* Broadcast _Float16 to vector.  */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ph (_Float16 __A)
+{
+  return _mm_set_ph (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_ph (_Float16 __A)
+{
+  return _mm256_set_ph (__A, __A, __A, __A, __A, __A, __A, __A,
+                       __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_ph (_Float16 __A)
+{
+  return _mm512_set_ph (__A, __A, __A, __A, __A, __A, __A, __A,
+                       __A, __A, __A, __A, __A, __A, __A, __A,
+                       __A, __A, __A, __A, __A, __A, __A, __A,
+                       __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+/* Create a vector with all zeros.  */
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ph (void)
+{
+  return _mm_set1_ph (0.0f16);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_ph (void)
+{
+  return _mm256_set1_ph (0.0f16);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_setzero_ph (void)
+{
+  return _mm512_set1_ph (0.0f16);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ph (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m128h __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_ph (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m256h __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_undefined_ph (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m512h __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_h (__m128h __A)
+{
+  return __A[0];
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsh_h (__m256h __A)
+{
+  return __A[0];
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtsh_h (__m512h __A)
+{
+  return __A[0];
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_ps (__m512h __a)
+{
+  return (__m512) __a;
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_pd (__m512h __a)
+{
+  return (__m512d) __a;
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph_si512 (__m512h __a)
+{
+  return (__m512i) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph512_ph128 (__m512h __A)
+{
+  union
+  {
+    __m128h __a[4];
+    __m512h __v;
+  } __u = { .__v = __A };
+  return __u.__a[0];
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph512_ph256 (__m512h __A)
+{
+  union
+  {
+    __m256h __a[2];
+    __m512h __v;
+  } __u = { .__v = __A };
+  return __u.__a[0];
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph128_ph512 (__m128h __A)
+{
+  union
+  {
+    __m128h __a[4];
+    __m512h __v;
+  } __u;
+  __u.__a[0] = __A;
+  return __u.__v;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castph256_ph512 (__m256h __A)
+{
+  union
+  {
+    __m256h __a[2];
+    __m512h __v;
+  } __u;
+  __u.__a[0] = __A;
+  return __u.__v;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextph128_ph512 (__m128h __A)
+{
+  return (__m512h) _mm512_insertf32x4 (_mm512_setzero_ps (),
+                                      (__m128) __A, 0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_zextph256_ph512 (__m256h __A)
+{
+  return (__m512h) _mm512_insertf64x4 (_mm512_setzero_pd (),
+                                      (__m256d) __A, 0);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castps_ph (__m512 __a)
+{
+  return (__m512h) __a;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castpd_ph (__m512d __a)
+{
+  return (__m512h) __a;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_castsi512_ph (__m512i __a)
+{
+  return (__m512h) __a;
+}
+
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sh (_Float16 __F)
+{
+  return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16,
+                    __F);
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sh (void const *__P)
+{
+  return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16,
+                    *(_Float16 const *) __P);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_load_ph (void const *__P)
+{
+  return *(const __m512h *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_ph (void const *__P)
+{
+  return *(const __m256h *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ph (void const *__P)
+{
+  return *(const __m128h *) __P;
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_loadu_ph (void const *__P)
+{
+  return *(const __m512h_u *) __P;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_ph (void const *__P)
+{
+  return *(const __m256h_u *) __P;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ph (void const *__P)
+{
+  return *(const __m128h_u *) __P;
+}
+
+/* Stores the lower _Float16 value.  */
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sh (void *__P, __m128h __A)
+{
+  *(_Float16 *) __P = ((__v8hf)__A)[0];
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_store_ph (void *__P, __m512h __A)
+{
+   *(__m512h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_ph (void *__P, __m256h __A)
+{
+   *(__m256h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ph (void *__P, __m128h __A)
+{
+   *(__m128h *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_storeu_ph (void *__P, __m512h __A)
+{
+   *(__m512h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_ph (void *__P, __m256h __A)
+{
+   *(__m256h_u *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ph (void *__P, __m128h __A)
+{
+   *(__m128h_u *) __P = __A;
+}
+
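Illustration only, not part of the patch: the plain load/store forms above require 64/32/16-byte alignment through the __m512h/__m256h/__m128h types, while the loadu/storeu forms go through the __aligned__ (1) *_u types and accept arbitrary addresses. A sketch assuming an ordinary, possibly unaligned buffer:

    _Float16 buf[32] = { 0 };
    __m512h v = _mm512_loadu_ph (buf);                 /* unaligned load  */
    v = _mm512_add_ph (v, _mm512_set1_ph (1.0f16));
    _mm512_storeu_ph (buf, v);                         /* unaligned store */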
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_abs_ph (__m512h __A)
+{
+  return (__m512h) _mm512_and_epi32 ( _mm512_set1_epi32 (0x7FFF7FFF),
+                                     (__m512i) __A);
+}
+
+/* Intrinsics v[add,sub,mul,div]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A + (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_addph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_addph512_mask (__B, __C,
+                                      _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A - (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_subph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_subph512_mask (__B, __C,
+                                      _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A * (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_mulph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_mulph512_mask (__B, __C,
+                                      _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_ph (__m512h __A, __m512h __B)
+{
+  return (__m512h) ((__v32hf) __A / (__v32hf) __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_divph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_divph512_mask (__B, __C,
+                                      _mm512_setzero_ph (), __A);
+}
+
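Illustration only, not part of the patch: a sketch of the write-mask convention used by the mask/maskz forms above. For _mm512_mask_add_ph (src, k, a, b), lane i is a[i] + b[i] when bit i of k is set and src[i] otherwise; the maskz form zeroes the unselected lanes instead.

    __m512h a   = _mm512_set1_ph (1.0f16);
    __m512h b   = _mm512_set1_ph (2.0f16);
    __m512h src = _mm512_setzero_ph ();
    __m512h r0 = _mm512_mask_add_ph (src, (__mmask32) 0xF, a, b);
                                      /* lanes 0-3: 3.0, others: src[i] */
    __m512h r1 = _mm512_maskz_add_ph ((__mmask32) 0xF, a, b);
                                      /* lanes 0-3: 3.0, others: 0.0    */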
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_addph512_mask_round (__A, __B,
+                                            _mm512_setzero_ph (),
+                                            (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                         __m512h __D, const int __E)
+{
+  return __builtin_ia32_addph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+                          const int __D)
+{
+  return __builtin_ia32_addph512_mask_round (__B, __C,
+                                            _mm512_setzero_ph (),
+                                            __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_subph512_mask_round (__A, __B,
+                                            _mm512_setzero_ph (),
+                                            (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                         __m512h __D, const int __E)
+{
+  return __builtin_ia32_subph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+                          const int __D)
+{
+  return __builtin_ia32_subph512_mask_round (__B, __C,
+                                            _mm512_setzero_ph (),
+                                            __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_mulph512_mask_round (__A, __B,
+                                            _mm512_setzero_ph (),
+                                            (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                         __m512h __D, const int __E)
+{
+  return __builtin_ia32_mulph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+                          const int __D)
+{
+  return __builtin_ia32_mulph512_mask_round (__B, __C,
+                                            _mm512_setzero_ph (),
+                                            __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_divph512_mask_round (__A, __B,
+                                            _mm512_setzero_ph (),
+                                            (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                         __m512h __D, const int __E)
+{
+  return __builtin_ia32_divph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+                          const int __D)
+{
+  return __builtin_ia32_divph512_mask_round (__B, __C,
+                                            _mm512_setzero_ph (),
+                                            __A, __D);
+}
+#else
+#define _mm512_add_round_ph(A, B, C)                                   \
+  ((__m512h)__builtin_ia32_addph512_mask_round((A), (B),               \
+                                              _mm512_setzero_ph (),    \
+                                              (__mmask32)-1, (C)))
+
+#define _mm512_mask_add_round_ph(A, B, C, D, E)                                \
+  ((__m512h)__builtin_ia32_addph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_add_round_ph(A, B, C, D)                          \
+  ((__m512h)__builtin_ia32_addph512_mask_round((B), (C),               \
+                                              _mm512_setzero_ph (),    \
+                                              (A), (D)))
+
+#define _mm512_sub_round_ph(A, B, C)                                   \
+  ((__m512h)__builtin_ia32_subph512_mask_round((A), (B),               \
+                                              _mm512_setzero_ph (),    \
+                                              (__mmask32)-1, (C)))
+
+#define _mm512_mask_sub_round_ph(A, B, C, D, E)                                \
+  ((__m512h)__builtin_ia32_subph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_sub_round_ph(A, B, C, D)                          \
+  ((__m512h)__builtin_ia32_subph512_mask_round((B), (C),               \
+                                              _mm512_setzero_ph (),    \
+                                              (A), (D)))
+
+#define _mm512_mul_round_ph(A, B, C)                                   \
+  ((__m512h)__builtin_ia32_mulph512_mask_round((A), (B),               \
+                                              _mm512_setzero_ph (),    \
+                                              (__mmask32)-1, (C)))
+
+#define _mm512_mask_mul_round_ph(A, B, C, D, E)                                \
+  ((__m512h)__builtin_ia32_mulph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_mul_round_ph(A, B, C, D)                          \
+  ((__m512h)__builtin_ia32_mulph512_mask_round((B), (C),               \
+                                              _mm512_setzero_ph (),    \
+                                              (A), (D)))
+
+#define _mm512_div_round_ph(A, B, C)                                   \
+  ((__m512h)__builtin_ia32_divph512_mask_round((A), (B),               \
+                                              _mm512_setzero_ph (),    \
+                                              (__mmask32)-1, (C)))
+
+#define _mm512_mask_div_round_ph(A, B, C, D, E)                                \
+  ((__m512h)__builtin_ia32_divph512_mask_round((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_div_round_ph(A, B, C, D)                          \
+  ((__m512h)__builtin_ia32_divph512_mask_round((B), (C),               \
+                                              _mm512_setzero_ph (),    \
+                                              (A), (D)))
+#endif  /* __OPTIMIZE__  */
+
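Illustration only, not part of the patch: the *_round_ph forms take an immediate rounding control as their last argument, overriding the MXCSR mode for that one operation. A sketch using the standard _MM_FROUND_* controls:

    __m512h x = _mm512_set1_ph (1.0f16);
    __m512h y = _mm512_set1_ph (3.0f16);
    __m512h r0 = _mm512_add_round_ph (x, y, _MM_FROUND_TO_NEAREST_INT
                                            | _MM_FROUND_NO_EXC);
    __m512h r1 = _mm512_div_round_ph (x, y, _MM_FROUND_CUR_DIRECTION);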
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_conj_pch (__m512h __A)
+{
+  return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 (1<<31));
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_conj_pch (__m512h __W, __mmask16 __U, __m512h __A)
+{
+  return (__m512h)
+    __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A),
+                                  (__v16sf) __W,
+                                  (__mmask16) __U);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_conj_pch (__mmask16 __U, __m512h __A)
+{
+  return (__m512h)
+    __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A),
+                                  (__v16sf) _mm512_setzero_ps (),
+                                  (__mmask16) __U);
+}
+
+/* Intrinsics of v[add,sub,mul,div]sh.  */
+extern __inline __m128h
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_sh (__m128h __A, __m128h __B)
+{
+  __A[0] += __B[0];
+  return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_addsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_addsh_mask (__B, __C, _mm_setzero_ph (),
+                                   __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_sh (__m128h __A, __m128h __B)
+{
+  __A[0] -= __B[0];
+  return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_subsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_subsh_mask (__B, __C, _mm_setzero_ph (),
+                                   __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_sh (__m128h __A, __m128h __B)
+{
+  __A[0] *= __B[0];
+  return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_mulsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_mulsh_mask (__B, __C, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_sh (__m128h __A, __m128h __B)
+{
+  __A[0] /= __B[0];
+  return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_divsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_divsh_mask (__B, __C, _mm_setzero_ph (),
+                                   __A);
+}
+
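Illustration only, not part of the patch: the scalar *_sh forms operate on element 0 only and carry the upper seven _Float16 lanes over from the first operand. A sketch:

    __m128h a = _mm_set_sh (2.0f16);     /* { 2, 0, 0, 0, 0, 0, 0, 0 } */
    __m128h b = _mm_set_sh (3.0f16);
    __m128h c = _mm_add_sh (a, b);       /* { 5, 0, 0, 0, 0, 0, 0, 0 } */
    _Float16 lo = _mm_cvtsh_h (c);       /* 5.0f16 */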
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_addsh_mask_round (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                      __m128h __D, const int __E)
+{
+  return __builtin_ia32_addsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                       const int __D)
+{
+  return __builtin_ia32_addsh_mask_round (__B, __C,
+                                         _mm_setzero_ph (),
+                                         __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_subsh_mask_round (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                      __m128h __D, const int __E)
+{
+  return __builtin_ia32_subsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                       const int __D)
+{
+  return __builtin_ia32_subsh_mask_round (__B, __C,
+                                         _mm_setzero_ph (),
+                                         __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_mulsh_mask_round (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                      __m128h __D, const int __E)
+{
+  return __builtin_ia32_mulsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                       const int __D)
+{
+  return __builtin_ia32_mulsh_mask_round (__B, __C,
+                                         _mm_setzero_ph (),
+                                         __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_divsh_mask_round (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                      __m128h __D, const int __E)
+{
+  return __builtin_ia32_divsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                       const int __D)
+{
+  return __builtin_ia32_divsh_mask_round (__B, __C,
+                                         _mm_setzero_ph (),
+                                         __A, __D);
+}
+#else
+#define _mm_add_round_sh(A, B, C)                                      \
+  ((__m128h)__builtin_ia32_addsh_mask_round ((A), (B),                 \
+                                            _mm_setzero_ph (),         \
+                                            (__mmask8)-1, (C)))
+
+#define _mm_mask_add_round_sh(A, B, C, D, E)                           \
+  ((__m128h)__builtin_ia32_addsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_add_round_sh(A, B, C, D)                     \
+  ((__m128h)__builtin_ia32_addsh_mask_round ((B), (C),         \
+                                            _mm_setzero_ph (), \
+                                            (A), (D)))
+
+#define _mm_sub_round_sh(A, B, C)                                      \
+  ((__m128h)__builtin_ia32_subsh_mask_round ((A), (B),                 \
+                                            _mm_setzero_ph (),         \
+                                            (__mmask8)-1, (C)))
+
+#define _mm_mask_sub_round_sh(A, B, C, D, E)                           \
+  ((__m128h)__builtin_ia32_subsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_sub_round_sh(A, B, C, D)                     \
+  ((__m128h)__builtin_ia32_subsh_mask_round ((B), (C),         \
+                                            _mm_setzero_ph (), \
+                                            (A), (D)))
+
+#define _mm_mul_round_sh(A, B, C)                                      \
+  ((__m128h)__builtin_ia32_mulsh_mask_round ((A), (B),                 \
+                                            _mm_setzero_ph (),         \
+                                            (__mmask8)-1, (C)))
+
+#define _mm_mask_mul_round_sh(A, B, C, D, E)                           \
+  ((__m128h)__builtin_ia32_mulsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_mul_round_sh(A, B, C, D)                     \
+  ((__m128h)__builtin_ia32_mulsh_mask_round ((B), (C),         \
+                                            _mm_setzero_ph (), \
+                                            (A), (D)))
+
+#define _mm_div_round_sh(A, B, C)                                      \
+  ((__m128h)__builtin_ia32_divsh_mask_round ((A), (B),                 \
+                                            _mm_setzero_ph (),         \
+                                            (__mmask8)-1, (C)))
+
+#define _mm_mask_div_round_sh(A, B, C, D, E)                           \
+  ((__m128h)__builtin_ia32_divsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_div_round_sh(A, B, C, D)                     \
+  ((__m128h)__builtin_ia32_divsh_mask_round ((B), (C),         \
+                                            _mm_setzero_ph (), \
+                                            (A), (D)))
+#endif /* __OPTIMIZE__ */
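+
+/* Editorial note, not part of the upstream GCC header: a minimal usage
+   sketch for the scalar rounding variants above.  The plain forms use
+   the current MXCSR rounding mode, the _round forms take it as an
+   immediate, e.g. (assuming an AVX512-FP16 target):
+
+     __m128h a = _mm_set_sh ((_Float16) 1.25f);
+     __m128h b = _mm_set_sh ((_Float16) 2.50f);
+     __m128h r = _mm_add_round_sh (a, b, _MM_FROUND_TO_NEAREST_INT
+                                         | _MM_FROUND_NO_EXC);
+     // r[0] = 3.75, r[1..7] copied from a; the mask/maskz variants
+     // blend or zero r[0] when the mask bit is clear.  */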
+
+/* Intrinsic vmaxph vminph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_ph (__m512h __A, __m512h __B)
+{
+  return __builtin_ia32_maxph512_mask (__A, __B,
+                                      _mm512_setzero_ph (),
+                                      (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_maxph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_maxph512_mask (__B, __C,
+                                      _mm512_setzero_ph (), __A);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_ph (__m512h __A, __m512h __B)
+{
+  return __builtin_ia32_minph512_mask (__A, __B,
+                                      _mm512_setzero_ph (),
+                                      (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_minph512_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_minph512_mask (__B, __C,
+                                      _mm512_setzero_ph (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_max_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_maxph512_mask_round (__A, __B,
+                                            _mm512_setzero_ph (),
+                                            (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_max_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                         __m512h __D, const int __E)
+{
+  return __builtin_ia32_maxph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_max_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+                          const int __D)
+{
+  return __builtin_ia32_maxph512_mask_round (__B, __C,
+                                            _mm512_setzero_ph (),
+                                            __A, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_min_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_minph512_mask_round (__A, __B,
+                                            _mm512_setzero_ph (),
+                                            (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_min_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                         __m512h __D, const int __E)
+{
+  return __builtin_ia32_minph512_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_min_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+                          const int __D)
+{
+  return __builtin_ia32_minph512_mask_round (__B, __C,
+                                            _mm512_setzero_ph (),
+                                            __A, __D);
+}
+
+#else
+#define _mm512_max_round_ph(A, B, C)                           \
+  (__builtin_ia32_maxph512_mask_round ((A), (B),               \
+                                      _mm512_setzero_ph (),    \
+                                      (__mmask32)-1, (C)))
+
+#define _mm512_mask_max_round_ph(A, B, C, D, E)                                \
+  (__builtin_ia32_maxph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_max_round_ph(A, B, C, D)                  \
+  (__builtin_ia32_maxph512_mask_round ((B), (C),               \
+                                      _mm512_setzero_ph (),    \
+                                      (A), (D)))
+
+#define _mm512_min_round_ph(A, B, C)                           \
+  (__builtin_ia32_minph512_mask_round ((A), (B),               \
+                                      _mm512_setzero_ph (),    \
+                                      (__mmask32)-1, (C)))
+
+#define _mm512_mask_min_round_ph(A, B, C, D, E)                                \
+  (__builtin_ia32_minph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_min_round_ph(A, B, C, D)                  \
+  (__builtin_ia32_minph512_mask_round ((B), (C),               \
+                                      _mm512_setzero_ph (),    \
+                                      (A), (D)))
+#endif /* __OPTIMIZE__ */
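+
+/* Editorial note, not part of the upstream GCC header: the three forms
+   above follow the usual AVX-512 masking convention.  Hedged sketch:
+
+     __m512h a = _mm512_set1_ph ((_Float16) 1.0f);
+     __m512h b = _mm512_set1_ph ((_Float16) 2.0f);
+     __m512h m = _mm512_max_ph (a, b);                   // per-lane max
+     __m512h k = _mm512_mask_max_ph (a, 0xFFFF, a, b);   // lanes 0-15 from max,
+                                                         // lanes 16-31 from a
+     __m512h z = _mm512_maskz_max_ph (0xFFFF, a, b);     // lanes 16-31 zeroed  */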
+
+/* Intrinsic vmaxsh vminsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_sh (__m128h __A, __m128h __B)
+{
+  __A[0] = __A[0] > __B[0] ? __A[0] : __B[0];
+  return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_maxsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_maxsh_mask (__B, __C, _mm_setzero_ph (),
+                                   __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_sh (__m128h __A, __m128h __B)
+{
+  __A[0] = __A[0] < __B[0] ? __A[0] : __B[0];
+  return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_minsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_minsh_mask (__B, __C, _mm_setzero_ph (),
+                                   __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_maxsh_mask_round (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                      __m128h __D, const int __E)
+{
+  return __builtin_ia32_maxsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                       const int __D)
+{
+  return __builtin_ia32_maxsh_mask_round (__B, __C,
+                                         _mm_setzero_ph (),
+                                         __A, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_minsh_mask_round (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                      __m128h __D, const int __E)
+{
+  return __builtin_ia32_minsh_mask_round (__C, __D, __A, __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                       const int __D)
+{
+  return __builtin_ia32_minsh_mask_round (__B, __C,
+                                         _mm_setzero_ph (),
+                                         __A, __D);
+}
+
+#else
+#define _mm_max_round_sh(A, B, C)                      \
+  (__builtin_ia32_maxsh_mask_round ((A), (B),          \
+                                   _mm_setzero_ph (),  \
+                                   (__mmask8)-1, (C)))
+
+#define _mm_mask_max_round_sh(A, B, C, D, E)                   \
+  (__builtin_ia32_maxsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_max_round_sh(A, B, C, D)             \
+  (__builtin_ia32_maxsh_mask_round ((B), (C),          \
+                                   _mm_setzero_ph (),  \
+                                   (A), (D)))
+
+#define _mm_min_round_sh(A, B, C)                      \
+  (__builtin_ia32_minsh_mask_round ((A), (B),          \
+                                   _mm_setzero_ph (),  \
+                                   (__mmask8)-1, (C)))
+
+#define _mm_mask_min_round_sh(A, B, C, D, E)                   \
+  (__builtin_ia32_minsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_min_round_sh(A, B, C, D)             \
+  (__builtin_ia32_minsh_mask_round ((B), (C),          \
+                                   _mm_setzero_ph (),  \
+                                   (A), (D)))
+
+#endif /* __OPTIMIZE__ */
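+
+/* Editorial note, not part of the upstream GCC header: the scalar
+   vmaxsh/vminsh forms combine only element 0; bits [127:16] of the
+   result are taken from the first source (or from the pass-through
+   operand in the masked variants).  Hedged sketch:
+
+     __m128h r = _mm_max_sh (a, b);   // r[0] = max(a[0], b[0]), r[1..7] = a[1..7]  */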
+
+/* vcmpph */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_ph_mask (__m512h __A, __m512h __B, const int __C)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask (__A, __B, __C,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
+                        const int __D)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask (__B, __C, __D,
+                                                  __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_ph_mask (__m512h __A, __m512h __B, const int __C,
+                         const int __D)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask_round (__A, __B,
+                                                        __C, (__mmask32) -1,
+                                                        __D);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
+                              const int __D, const int __E)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask_round (__B, __C,
+                                                        __D, __A,
+                                                        __E);
+}
+
+#else
+#define _mm512_cmp_ph_mask(A, B, C)                    \
+  (__builtin_ia32_cmpph512_mask ((A), (B), (C), (-1)))
+
+#define _mm512_mask_cmp_ph_mask(A, B, C, D)            \
+  (__builtin_ia32_cmpph512_mask ((B), (C), (D), (A)))
+
+#define _mm512_cmp_round_ph_mask(A, B, C, D)                           \
+  (__builtin_ia32_cmpph512_mask_round ((A), (B), (C), (-1), (D)))
+
+#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E)                   \
+  (__builtin_ia32_cmpph512_mask_round ((B), (C), (D), (A), (E)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcmpsh.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sh_mask (__m128h __A, __m128h __B, const int __C)
+{
+  return (__mmask8)
+    __builtin_ia32_cmpsh_mask_round (__A, __B,
+                                    __C, (__mmask8) -1,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_sh_mask (__mmask8 __A, __m128h __B, __m128h __C,
+                     const int __D)
+{
+  return (__mmask8)
+    __builtin_ia32_cmpsh_mask_round (__B, __C,
+                                    __D, __A,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_round_sh_mask (__m128h __A, __m128h __B, const int __C,
+                      const int __D)
+{
+  return (__mmask8) __builtin_ia32_cmpsh_mask_round (__A, __B,
+                                                    __C, (__mmask8) -1,
+                                                    __D);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_round_sh_mask (__mmask8 __A, __m128h __B, __m128h __C,
+                           const int __D, const int __E)
+{
+  return (__mmask8) __builtin_ia32_cmpsh_mask_round (__B, __C,
+                                                    __D, __A,
+                                                    __E);
+}
+
+#else
+#define _mm_cmp_sh_mask(A, B, C)                                       \
+  (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1),               \
+                                   (_MM_FROUND_CUR_DIRECTION)))
+
+#define _mm_mask_cmp_sh_mask(A, B, C, D)                               \
+  (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A),                        \
+                                   (_MM_FROUND_CUR_DIRECTION)))
+
+#define _mm_cmp_round_sh_mask(A, B, C, D)                      \
+  (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), (D)))
+
+#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E)              \
+  (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), (E)))
+
+#endif /* __OPTIMIZE__ */
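+
+/* Editorial note, not part of the upstream GCC header: the compare
+   intrinsics above return mask registers rather than vectors; the
+   predicate is one of the _CMP_* constants from avxintrin.h.  Hedged
+   sketch:
+
+     __mmask32 k  = _mm512_cmp_ph_mask (x, y, _CMP_LT_OS);  // per-lane x < y
+     __mmask8  k0 = _mm_cmp_sh_mask (a, b, _CMP_ORD_Q);     // bit 0: a[0], b[0] ordered  */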
+
+/* Intrinsics vcomish.  */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OS,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OS,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OS,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OS,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OS,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_US,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OQ,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OQ,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OQ,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OQ,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OQ,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_UQ,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_sh (__m128h __A, __m128h __B, const int __P)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, __P,
+                                         (__mmask8) -1,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comi_round_sh (__m128h __A, __m128h __B, const int __P, const int __R)
+{
+  return __builtin_ia32_cmpsh_mask_round (__A, __B, __P,
+                                         (__mmask8) -1,__R);
+}
+
+#else
+#define _mm_comi_round_sh(A, B, P, R)                                  \
+  (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), (R)))
+#define _mm_comi_sh(A, B, P)                                           \
+  (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1),    \
+                                   _MM_FROUND_CUR_DIRECTION))
+
+#endif /* __OPTIMIZE__  */
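+
+/* Editorial note, not part of the upstream GCC header: the comi/ucomi
+   helpers above return a plain int (0 or 1).  As the predicates show,
+   the comi* forms are signaling and the ucomi* forms quiet, so e.g.
+   _mm_comilt_sh raises the invalid-operation exception on a QNaN
+   operand while _mm_ucomilt_sh does not.  Hedged sketch:
+
+     if (_mm_comilt_sh (a, b))   // a[0] < b[0]
+       do_something ();          // do_something is a placeholder  */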
+
+/* Intrinsics vsqrtph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_ph (__m512h __A)
+{
+  return __builtin_ia32_sqrtph512_mask_round (__A,
+                                             _mm512_setzero_ph(),
+                                             (__mmask32) -1,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+  return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_ph (__mmask32 __A, __m512h __B)
+{
+  return __builtin_ia32_sqrtph512_mask_round (__B,
+                                             _mm512_setzero_ph (),
+                                             __A,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_sqrt_round_ph (__m512h __A, const int __B)
+{
+  return __builtin_ia32_sqrtph512_mask_round (__A,
+                                             _mm512_setzero_ph(),
+                                             (__mmask32) -1, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_sqrt_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                          const int __D)
+{
+  return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_sqrt_round_ph (__mmask32 __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_sqrtph512_mask_round (__B,
+                                             _mm512_setzero_ph (),
+                                             __A, __C);
+}
+
+#else
+#define _mm512_sqrt_round_ph(A, B)                             \
+  (__builtin_ia32_sqrtph512_mask_round ((A),                   \
+                                       _mm512_setzero_ph (),   \
+                                       (__mmask32)-1, (B)))
+
+#define _mm512_mask_sqrt_round_ph(A, B, C, D)                  \
+  (__builtin_ia32_sqrtph512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_sqrt_round_ph(A, B, C)                    \
+  (__builtin_ia32_sqrtph512_mask_round ((B),                   \
+                                       _mm512_setzero_ph (),   \
+                                       (A), (C)))
+
+#endif /* __OPTIMIZE__ */
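+
+/* Editorial note, not part of the upstream GCC header: hedged sketch of
+   the packed square root; the plain form uses the current rounding
+   mode, the _round form takes it explicitly:
+
+     __m512h s = _mm512_sqrt_ph (x);
+     __m512h t = _mm512_sqrt_round_ph (x, _MM_FROUND_TO_NEAREST_INT
+                                          | _MM_FROUND_NO_EXC);  */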
+
+/* Intrinsics vrsqrtph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rsqrt_ph (__m512h __A)
+{
+  return __builtin_ia32_rsqrtph512_mask (__A, _mm512_setzero_ph (),
+                                        (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rsqrt_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+  return __builtin_ia32_rsqrtph512_mask (__C, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rsqrt_ph (__mmask32 __A, __m512h __B)
+{
+  return __builtin_ia32_rsqrtph512_mask (__B, _mm512_setzero_ph (),
+                                        __A);
+}
+
+/* Intrinsics vrsqrtsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_rsqrtsh_mask (__B, __A, _mm_setzero_ph (),
+                                     (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_rsqrtsh_mask (__D, __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_rsqrtsh_mask (__C, __B, _mm_setzero_ph (),
+                                     __A);
+}
+
+/* Intrinsics vsqrtsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_sqrtsh_mask_round (__B, __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1,
+                                          _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B,
+                                          _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_sqrtsh_mask_round (__C, __B,
+                                          _mm_setzero_ph (),
+                                          __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_sqrtsh_mask_round (__B, __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                       __m128h __D, const int __E)
+{
+  return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B,
+                                          __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                        const int __D)
+{
+  return __builtin_ia32_sqrtsh_mask_round (__C, __B,
+                                          _mm_setzero_ph (),
+                                          __A, __D);
+}
+
+#else
+#define _mm_sqrt_round_sh(A, B, C)                             \
+  (__builtin_ia32_sqrtsh_mask_round ((B), (A),                 \
+                                    _mm_setzero_ph (),         \
+                                    (__mmask8)-1, (C)))
+
+#define _mm_mask_sqrt_round_sh(A, B, C, D, E)                  \
+  (__builtin_ia32_sqrtsh_mask_round ((D), (C), (A), (B), (E)))
+
+#define _mm_maskz_sqrt_round_sh(A, B, C, D)            \
+  (__builtin_ia32_sqrtsh_mask_round ((C), (B),         \
+                                    _mm_setzero_ph (), \
+                                    (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrcpph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_rcp_ph (__m512h __A)
+{
+  return __builtin_ia32_rcpph512_mask (__A, _mm512_setzero_ph (),
+                                      (__mmask32) -1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_rcp_ph (__m512h __A, __mmask32 __B, __m512h __C)
+{
+  return __builtin_ia32_rcpph512_mask (__C, __A, __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_rcp_ph (__mmask32 __A, __m512h __B)
+{
+  return __builtin_ia32_rcpph512_mask (__B, _mm512_setzero_ph (),
+                                      __A);
+}
+
+/* Intrinsics vrcpsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_rcpsh_mask (__B, __A, _mm_setzero_ph (),
+                                   (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_rcpsh_mask (__D, __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_rcpsh_mask (__C, __B, _mm_setzero_ph (),
+                                   __A);
+}
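+
+/* Editorial note, not part of the upstream GCC header: vrcpph/vrcpsh and
+   vrsqrtph/vrsqrtsh above are hardware approximations of 1/x and
+   1/sqrt(x) rather than correctly rounded results, which is why they
+   take no rounding argument.  Hedged sketch:
+
+     __m512h inv = _mm512_rcp_ph (x);     // ~1/x per element
+     __m512h rsq = _mm512_rsqrt_ph (x);   // ~1/sqrt(x) per element  */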
+
+/* Intrinsics vscalefph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_ph (__m512h __A, __m512h __B)
+{
+  return __builtin_ia32_scalefph512_mask_round (__A, __B,
+                                               _mm512_setzero_ph (),
+                                               (__mmask32) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D)
+{
+  return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_ph (__mmask32 __A, __m512h __B, __m512h __C)
+{
+  return __builtin_ia32_scalefph512_mask_round (__B, __C,
+                                               _mm512_setzero_ph (),
+                                               __A,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_scalef_round_ph (__m512h __A, __m512h __B, const int __C)
+{
+  return __builtin_ia32_scalefph512_mask_round (__A, __B,
+                                               _mm512_setzero_ph (),
+                                               (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_scalef_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                            __m512h __D, const int __E)
+{
+  return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B,
+                                               __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_scalef_round_ph (__mmask32 __A, __m512h __B, __m512h __C,
+                             const int __D)
+{
+  return __builtin_ia32_scalefph512_mask_round (__B, __C,
+                                               _mm512_setzero_ph (),
+                                               __A, __D);
+}
+
+#else
+#define _mm512_scalef_round_ph(A, B, C)                                \
+  (__builtin_ia32_scalefph512_mask_round ((A), (B),            \
+                                         _mm512_setzero_ph (), \
+                                         (__mmask32)-1, (C)))
+
+#define _mm512_mask_scalef_round_ph(A, B, C, D, E)                     \
+  (__builtin_ia32_scalefph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_scalef_round_ph(A, B, C, D)               \
+  (__builtin_ia32_scalefph512_mask_round ((B), (C),            \
+                                         _mm512_setzero_ph (), \
+                                         (A), (D)))
+
+#endif  /* __OPTIMIZE__ */
+
+/* Intrinsics vscalefsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_sh (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_scalefsh_mask_round (__A, __B,
+                                            _mm_setzero_ph (),
+                                            (__mmask8) -1,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_sh (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_scalefsh_mask_round (__B, __C,
+                                            _mm_setzero_ph (),
+                                            __A,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_round_sh (__m128h __A, __m128h __B, const int __C)
+{
+  return __builtin_ia32_scalefsh_mask_round (__A, __B,
+                                            _mm_setzero_ph (),
+                                            (__mmask8) -1, __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                         __m128h __D, const int __E)
+{
+  return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B,
+                                            __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                          const int __D)
+{
+  return __builtin_ia32_scalefsh_mask_round (__B, __C,
+                                            _mm_setzero_ph (),
+                                            __A, __D);
+}
+
+#else
+#define _mm_scalef_round_sh(A, B, C)                           \
+  (__builtin_ia32_scalefsh_mask_round ((A), (B),               \
+                                      _mm_setzero_ph (),       \
+                                      (__mmask8)-1, (C)))
+
+#define _mm_mask_scalef_round_sh(A, B, C, D, E)                                \
+  (__builtin_ia32_scalefsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_scalef_round_sh(A, B, C, D)                          \
+  (__builtin_ia32_scalefsh_mask_round ((B), (C), _mm_setzero_ph (),    \
+                                      (A), (D)))
+
+#endif /* __OPTIMIZE__ */
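+
+/* Editorial note, not part of the upstream GCC header: vscalefph/vscalefsh
+   compute a * 2^floor(b) per element, the usual ldexp-style scaling
+   primitive.  Hedged sketch:
+
+     __m512h scaled = _mm512_scalef_ph (x, _mm512_set1_ph ((_Float16) 3.0f));
+     // every element of x multiplied by 8  */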
+
+/* Intrinsics vreduceph.  */
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_ph (__m512h __A, int __B)
+{
+  return __builtin_ia32_reduceph512_mask_round (__A, __B,
+                                               _mm512_setzero_ph (),
+                                               (__mmask32) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_ph (__m512h __A, __mmask32 __B, __m512h __C, int __D)
+{
+  return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_ph (__mmask32 __A, __m512h __B, int __C)
+{
+  return __builtin_ia32_reduceph512_mask_round (__B, __C,
+                                               _mm512_setzero_ph (),
+                                               __A,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_round_ph (__m512h __A, int __B, const int __C)
+{
+  return __builtin_ia32_reduceph512_mask_round (__A, __B,
+                                               _mm512_setzero_ph (),
+                                               (__mmask32) -1, __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_reduce_round_ph (__m512h __A, __mmask32 __B, __m512h __C,
+                            int __D, const int __E)
+{
+  return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B,
+                                               __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_reduce_round_ph (__mmask32 __A, __m512h __B, int __C,
+                             const int __D)
+{
+  return __builtin_ia32_reduceph512_mask_round (__B, __C,
+                                               _mm512_setzero_ph (),
+                                               __A, __D);
+}
+
+#else
+#define _mm512_reduce_ph(A, B)                                         \
+  (__builtin_ia32_reduceph512_mask_round ((A), (B),                    \
+                                         _mm512_setzero_ph (),         \
+                                         (__mmask32)-1,                \
+                                         _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_reduce_ph(A, B, C, D)                              \
+  (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B),          \
+                                         _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_reduce_ph(A, B, C)                                        \
+  (__builtin_ia32_reduceph512_mask_round ((B), (C),                    \
+                                         _mm512_setzero_ph (),         \
+                                         (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_reduce_round_ph(A, B, C)                                \
+  (__builtin_ia32_reduceph512_mask_round ((A), (B),            \
+                                         _mm512_setzero_ph (), \
+                                         (__mmask32)-1, (C)))
+
+#define _mm512_mask_reduce_round_ph(A, B, C, D, E)                     \
+  (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_reduce_round_ph(A, B, C, D)               \
+  (__builtin_ia32_reduceph512_mask_round ((B), (C),            \
+                                         _mm512_setzero_ph (), \
+                                         (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vreducesh.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_sh (__m128h __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_reducesh_mask_round (__A, __B, __C,
+                                            _mm_setzero_ph (),
+                                            (__mmask8) -1,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                   __m128h __D, int __E)
+{
+  return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, __B,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_reducesh_mask_round (__B, __C, __D,
+                                            _mm_setzero_ph (), __A,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_round_sh (__m128h __A, __m128h __B, int __C, const int __D)
+{
+  return __builtin_ia32_reducesh_mask_round (__A, __B, __C,
+                                            _mm_setzero_ph (),
+                                            (__mmask8) -1, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                         __m128h __D, int __E, const int __F)
+{
+  return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A,
+                                            __B, __F);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                          int __D, const int __E)
+{
+  return __builtin_ia32_reducesh_mask_round (__B, __C, __D,
+                                            _mm_setzero_ph (),
+                                            __A, __E);
+}
+
+#else
+#define _mm_reduce_sh(A, B, C)                                         \
+  (__builtin_ia32_reducesh_mask_round ((A), (B), (C),                  \
+                                      _mm_setzero_ph (),               \
+                                      (__mmask8)-1,                    \
+                                      _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_reduce_sh(A, B, C, D, E)                              \
+  (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B),                \
+                                      _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_reduce_sh(A, B, C, D)                                        \
+  (__builtin_ia32_reducesh_mask_round ((B), (C), (D),                  \
+                                      _mm_setzero_ph (),               \
+                                      (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_reduce_round_sh(A, B, C, D)                                \
+  (__builtin_ia32_reducesh_mask_round ((A), (B), (C),          \
+                                      _mm_setzero_ph (),       \
+                                      (__mmask8)-1, (D)))
+
+#define _mm_mask_reduce_round_sh(A, B, C, D, E, F)                     \
+  (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), (F)))
+
+#define _mm_maskz_reduce_round_sh(A, B, C, D, E)               \
+  (__builtin_ia32_reducesh_mask_round ((B), (C), (D),          \
+                                      _mm_setzero_ph (),       \
+                                      (A), (E)))
+
+#endif /* __OPTIMIZE__ */
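+
+/* Editorial note, not part of the upstream GCC header: vreduceph/vreducesh
+   return the reduced argument x - RoundScale(x, imm8), where the upper
+   nibble of the imm8 is the number of fraction bits kept and the lower
+   nibble the rounding control.  Hedged sketch:
+
+     __m512h r = _mm512_reduce_ph (x, 0x00);   // x minus x rounded to the
+                                               // nearest integer  */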
+
+/* Intrinsics vrndscaleph.  */
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_ph (__m512h __A, int __B)
+{
+  return __builtin_ia32_rndscaleph512_mask_round (__A, __B,
+                                                 _mm512_setzero_ph (),
+                                                 (__mmask32) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_ph (__m512h __A, __mmask32 __B,
+                          __m512h __C, int __D)
+{
+  return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_ph (__mmask32 __A, __m512h __B, int __C)
+{
+  return __builtin_ia32_rndscaleph512_mask_round (__B, __C,
+                                                 _mm512_setzero_ph (),
+                                                 __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_roundscale_round_ph (__m512h __A, int __B, const int __C)
+{
+  return __builtin_ia32_rndscaleph512_mask_round (__A, __B,
+                                                 _mm512_setzero_ph (),
+                                                 (__mmask32) -1,
+                                                 __C);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_roundscale_round_ph (__m512h __A, __mmask32 __B,
+                                __m512h __C, int __D, const int __E)
+{
+  return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A,
+                                                 __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_roundscale_round_ph (__mmask32 __A, __m512h __B, int __C,
+                                 const int __D)
+{
+  return __builtin_ia32_rndscaleph512_mask_round (__B, __C,
+                                                 _mm512_setzero_ph (),
+                                                 __A, __D);
+}
+
+#else
+#define _mm512_roundscale_ph(A, B)                                     \
+  (__builtin_ia32_rndscaleph512_mask_round ((A), (B),                  \
+                                           _mm512_setzero_ph (),       \
+                                           (__mmask32)-1,              \
+                                           _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_roundscale_ph(A, B, C, D)                          \
+  (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B),                \
+                                           _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_maskz_roundscale_ph(A, B, C)                            \
+  (__builtin_ia32_rndscaleph512_mask_round ((B), (C),                  \
+                                           _mm512_setzero_ph (),       \
+                                           (A),                        \
+                                           _MM_FROUND_CUR_DIRECTION))
+#define _mm512_roundscale_round_ph(A, B, C)                            \
+  (__builtin_ia32_rndscaleph512_mask_round ((A), (B),                  \
+                                           _mm512_setzero_ph (),       \
+                                           (__mmask32)-1, (C)))
+
+#define _mm512_mask_roundscale_round_ph(A, B, C, D, E)                 \
+  (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm512_maskz_roundscale_round_ph(A, B, C, D)                   \
+  (__builtin_ia32_rndscaleph512_mask_round ((B), (C),                  \
+                                           _mm512_setzero_ph (),       \
+                                           (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vrndscalesh.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_sh (__m128h __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C,
+                                              _mm_setzero_ph (),
+                                              (__mmask8) -1,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                       __m128h __D, int __E)
+{
+  return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, __A, __B,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D,
+                                              _mm_setzero_ph (), __A,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_round_sh (__m128h __A, __m128h __B, int __C, const int __D)
+{
+  return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C,
+                                              _mm_setzero_ph (),
+                                              (__mmask8) -1,
+                                              __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_round_sh (__m128h __A, __mmask8 __B, __m128h __C,
+                             __m128h __D, int __E, const int __F)
+{
+  return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E,
+                                              __A, __B, __F);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_round_sh (__mmask8 __A, __m128h __B, __m128h __C,
+                              int __D, const int __E)
+{
+  return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D,
+                                              _mm_setzero_ph (),
+                                              __A, __E);
+}
+
+#else
+#define _mm_roundscale_sh(A, B, C)                                     \
+  (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C),                        \
+                                        _mm_setzero_ph (),             \
+                                        (__mmask8)-1,                  \
+                                        _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_roundscale_sh(A, B, C, D, E)                          \
+  (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B),      \
+                                        _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_roundscale_sh(A, B, C, D)                            \
+  (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D),                        \
+                                        _mm_setzero_ph (),             \
+                                        (A), _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_roundscale_round_sh(A, B, C, D)                    \
+  (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C),                \
+                                        _mm_setzero_ph (),     \
+                                        (__mmask8)-1, (D)))
+
+#define _mm_mask_roundscale_round_sh(A, B, C, D, E, F)                 \
+  (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), (F)))
+
+#define _mm_maskz_roundscale_round_sh(A, B, C, D, E)           \
+  (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D),                \
+                                        _mm_setzero_ph (),     \
+                                        (A), (E)))
+
+#endif /* __OPTIMIZE__ */
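+
+/* Editorial note, not part of the upstream GCC header: vrndscaleph rounds
+   each element to a multiple of 2^-M, where M is the upper nibble of the
+   imm8 and the lower nibble selects the rounding behaviour.  Hedged
+   sketch:
+
+     __m512h i = _mm512_roundscale_ph (x, 0x00);   // round to nearest integer
+     __m512h h = _mm512_roundscale_ph (x, 0x10);   // round to multiples of 0.5  */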
+
+/* Intrinsics vfpclasssh.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sh_mask (__m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sh_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm, __U);
+}
+
+#else
+#define _mm_fpclass_sh_mask(X, C)                                      \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X),   \
+                                            (int) (C), (__mmask8) (-1)))
+
+#define _mm_mask_fpclass_sh_mask(U, X, C)                              \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X),   \
+                                            (int) (C), (__mmask8) (U)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclassph.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ph_mask (__mmask32 __U, __m512h __A,
+                            const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                      __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ph_mask (__m512h __A, const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                      __imm,
+                                                      (__mmask32) -1);
+}
+
+#else
+#define _mm512_mask_fpclass_ph_mask(u, x, c)                           \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                (int) (c), (__mmask32) (u)))
+
+#define _mm512_fpclass_ph_mask(x, c)                                    \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x), \
+                                                (int) (c), (__mmask32) -1))
+#endif /* __OPTIMIZE__ */
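+
+/* Editorial note, not part of the upstream GCC header: vfpclasssh/vfpclassph
+   test each element against the categories selected by the imm8 (QNaN,
+   SNaN, +/-0, +/-Inf, denormal, negative) and set the corresponding mask
+   bit when any selected category matches.  Hedged sketch, with
+   CATEGORY_BITS a placeholder for the desired imm8 value:
+
+     __mmask32 special = _mm512_fpclass_ph_mask (x, CATEGORY_BITS);  */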
+
+/* Intrinsics vgetexpph, vgetexpsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_sh (__m128h __A, __m128h __B)
+{
+  return (__m128h)
+    __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+                                       (__v8hf) _mm_setzero_ph (),
+                                       (__mmask8) -1,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+  return (__m128h)
+    __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+                                       (__v8hf) __W, (__mmask8) __U,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_sh (__mmask8 __U, __m128h __A, __m128h __B)
+{
+  return (__m128h)
+    __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+                                       (__v8hf) _mm_setzero_ph (),
+                                       (__mmask8) __U,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_ph (__m512h __A)
+{
+  return (__m512h)
+    __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+                                    (__v32hf) _mm512_setzero_ph (),
+                                    (__mmask32) -1, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_ph (__m512h __W, __mmask32 __U, __m512h __A)
+{
+  return (__m512h)
+    __builtin_ia32_getexpph512_mask ((__v32hf) __A, (__v32hf) __W,
+                                    (__mmask32) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_ph (__mmask32 __U, __m512h __A)
+{
+  return (__m512h)
+    __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+                                    (__v32hf) _mm512_setzero_ph (),
+                                    (__mmask32) __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_round_sh (__m128h __A, __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      _mm_setzero_ph (),
+                                                      (__mmask8) -1,
+                                                      __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_round_sh (__m128h __W, __mmask8 __U, __m128h __A,
+                         __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __W,
+                                                      (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_round_sh (__mmask8 __U, __m128h __A, __m128h __B,
+                          const int __R)
+{
+  return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf)
+                                                      _mm_setzero_ph (),
+                                                      (__mmask8) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getexp_round_ph (__m512h __A, const int __R)
+{
+  return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+                                                   (__v32hf)
+                                                   _mm512_setzero_ph (),
+                                                   (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getexp_round_ph (__m512h __W, __mmask32 __U, __m512h __A,
+                            const int __R)
+{
+  return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+                                                   (__v32hf) __W,
+                                                   (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getexp_round_ph (__mmask32 __U, __m512h __A, const int __R)
+{
+  return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A,
+                                                   (__v32hf)
+                                                   _mm512_setzero_ph (),
+                                                   (__mmask32) __U, __R);
+}
+
+#else
+#define _mm_getexp_round_sh(A, B, R)                                   \
+  ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A),   \
+                                              (__v8hf)(__m128h)(B),    \
+                                              (__v8hf)_mm_setzero_ph(), \
+                                              (__mmask8)-1, R))
+
+#define _mm_mask_getexp_round_sh(W, U, A, B, C)                        \
+  ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A),   \
+                                              (__v8hf)(__m128h)(B),    \
+                                              (__v8hf)(__m128h)(W),    \
+                                              (__mmask8)(U), (C)))
+
+#define _mm_maskz_getexp_round_sh(U, A, B, C)                          \
+  ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A),   \
+                                              (__v8hf)(__m128h)(B),    \
+                                              (__v8hf)_mm_setzero_ph(), \
+                                              (__mmask8)(U), (C)))
+
+#define _mm512_getexp_round_ph(A, R)                                   \
+  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),     \
+                                           (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, R))
+
+#define _mm512_mask_getexp_round_ph(W, U, A, R)                                \
+  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),     \
+                                           (__v32hf)(__m512h)(W), (__mmask32)(U), R))
+
+#define _mm512_maskz_getexp_round_ph(U, A, R)                          \
+  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A),     \
+                                           (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), R))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vgetmantph, vgetmantsh.  */
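+/* vgetmantph/vgetmantsh return the normalized mantissa of each element;
+   the _MM_MANTISSA_NORM_ENUM and _MM_MANTISSA_SIGN_ENUM arguments select
+   the normalization interval and sign handling, and are packed into one
+   immediate as (sign << 2) | norm below.  */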
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_sh (__m128h __A, __m128h __B,
+               _MM_MANTISSA_NORM_ENUM __C,
+               _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128h)
+    __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+                                        (__D << 2) | __C, _mm_setzero_ph (),
+                                        (__mmask8) -1,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_sh (__m128h __W, __mmask8 __U, __m128h __A,
+                    __m128h __B, _MM_MANTISSA_NORM_ENUM __C,
+                    _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128h)
+    __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+                                        (__D << 2) | __C, (__v8hf) __W,
+                                        __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_sh (__mmask8 __U, __m128h __A, __m128h __B,
+                     _MM_MANTISSA_NORM_ENUM __C,
+                     _MM_MANTISSA_SIGN_ENUM __D)
+{
+  return (__m128h)
+    __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B,
+                                        (__D << 2) | __C,
+                                        (__v8hf) _mm_setzero_ph(),
+                                        __U, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+                                                    (__C << 2) | __B,
+                                                    _mm512_setzero_ph (),
+                                                    (__mmask32) -1,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_ph (__m512h __W, __mmask32 __U, __m512h __A,
+                       _MM_MANTISSA_NORM_ENUM __B,
+                       _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v32hf) __W, __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_ph (__mmask32 __U, __m512h __A,
+                        _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v32hf)
+                                                    _mm512_setzero_ph (),
+                                                    __U,
+                                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_round_sh (__m128h __A, __m128h __B,
+                     _MM_MANTISSA_NORM_ENUM __C,
+                     _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__D << 2) | __C,
+                                                       _mm_setzero_ph (),
+                                                       (__mmask8) -1,
+                                                       __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_round_sh (__m128h __W, __mmask8 __U, __m128h __A,
+                          __m128h __B, _MM_MANTISSA_NORM_ENUM __C,
+                          _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__D << 2) | __C,
+                                                       (__v8hf) __W,
+                                                       __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_round_sh (__mmask8 __U, __m128h __A, __m128h __B,
+                           _MM_MANTISSA_NORM_ENUM __C,
+                           _MM_MANTISSA_SIGN_ENUM __D, const int __R)
+{
+  return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__D << 2) | __C,
+                                                       (__v8hf)
+                                                       _mm_setzero_ph(),
+                                                       __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_getmant_round_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+                                                    (__C << 2) | __B,
+                                                    _mm512_setzero_ph (),
+                                                    (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_getmant_round_ph (__m512h __W, __mmask32 __U, __m512h __A,
+                             _MM_MANTISSA_NORM_ENUM __B,
+                             _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v32hf) __W, __U,
+                                                    __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h __A,
+                              _MM_MANTISSA_NORM_ENUM __B,
+                              _MM_MANTISSA_SIGN_ENUM __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v32hf)
+                                                    _mm512_setzero_ph (),
+                                                    __U, __R);
+}
+
+#else
+#define _mm512_getmant_ph(X, B, C)                                     \
+  ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X),   \
+                                             (int)(((C)<<2) | (B)),    \
+                                             (__v32hf)(__m512h)        \
+                                             _mm512_setzero_ph(),      \
+                                             (__mmask32)-1,            \
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_mask_getmant_ph(W, U, X, B, C)                          \
+  ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X),   \
+                                             (int)(((C)<<2) | (B)),    \
+                                             (__v32hf)(__m512h)(W),    \
+                                             (__mmask32)(U),           \
+                                             _MM_FROUND_CUR_DIRECTION))
+
+
+#define _mm512_maskz_getmant_ph(U, X, B, C)                            \
+  ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X),   \
+                                             (int)(((C)<<2) | (B)),    \
+                                             (__v32hf)(__m512h)        \
+                                             _mm512_setzero_ph(),      \
+                                             (__mmask32)(U),           \
+                                             _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_getmant_sh(X, Y, C, D)                                     \
+  ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+                                                (__v8hf)(__m128h)(Y),  \
+                                                (int)(((D)<<2) | (C)), \
+                                                (__v8hf)(__m128h)      \
+                                                _mm_setzero_ph (),     \
+                                                (__mmask8)-1,          \
+                                                _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_mask_getmant_sh(W, U, X, Y, C, D)                          \
+  ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+                                                (__v8hf)(__m128h)(Y),  \
+                                                (int)(((D)<<2) | (C)), \
+                                                (__v8hf)(__m128h)(W),  \
+                                                (__mmask8)(U),         \
+                                                _MM_FROUND_CUR_DIRECTION))
+
+#define _mm_maskz_getmant_sh(U, X, Y, C, D)                            \
+  ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+                                                (__v8hf)(__m128h)(Y),  \
+                                                (int)(((D)<<2) | (C)), \
+                                                (__v8hf)(__m128h)      \
+                                                _mm_setzero_ph(),      \
+                                                (__mmask8)(U),         \
+                                                _MM_FROUND_CUR_DIRECTION))
+
+#define _mm512_getmant_round_ph(X, B, C, R)                            \
+  ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X),   \
+                                             (int)(((C)<<2) | (B)),    \
+                                             (__v32hf)(__m512h)        \
+                                             _mm512_setzero_ph(),      \
+                                             (__mmask32)-1,            \
+                                             (R)))
+
+#define _mm512_mask_getmant_round_ph(W, U, X, B, C, R)                 \
+  ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X),   \
+                                             (int)(((C)<<2) | (B)),    \
+                                             (__v32hf)(__m512h)(W),    \
+                                             (__mmask32)(U),           \
+                                             (R)))
+
+
+#define _mm512_maskz_getmant_round_ph(U, X, B, C, R)                   \
+  ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X),   \
+                                             (int)(((C)<<2) | (B)),    \
+                                             (__v32hf)(__m512h)        \
+                                             _mm512_setzero_ph(),      \
+                                             (__mmask32)(U),           \
+                                             (R)))
+
+#define _mm_getmant_round_sh(X, Y, C, D, R)                            \
+  ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+                                                (__v8hf)(__m128h)(Y),  \
+                                                (int)(((D)<<2) | (C)), \
+                                                (__v8hf)(__m128h)      \
+                                                _mm_setzero_ph (),     \
+                                                (__mmask8)-1,          \
+                                                (R)))
+
+#define _mm_mask_getmant_round_sh(W, U, X, Y, C, D, R)                 \
+  ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+                                                (__v8hf)(__m128h)(Y),  \
+                                                (int)(((D)<<2) | (C)), \
+                                                (__v8hf)(__m128h)(W),  \
+                                                (__mmask8)(U),         \
+                                                (R)))
+
+#define _mm_maskz_getmant_round_sh(U, X, Y, C, D, R)                   \
+  ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \
+                                                (__v8hf)(__m128h)(Y),  \
+                                                (int)(((D)<<2) | (C)), \
+                                                (__v8hf)(__m128h)      \
+                                                _mm_setzero_ph(),      \
+                                                (__mmask8)(U),         \
+                                                (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vmovw.  */
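+/* vmovw moves a 16-bit integer to or from the low word of an __m128i;
+   the remaining elements are zeroed on the way in.  */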
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi16_si128 (short __A)
+{
+  return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A);
+}
+
+extern __inline short
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si16 (__m128i __A)
+{
+  return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0);
+}
+
+/* Intrinsics vmovsh.  */
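+/* vmovsh provides masked load, store and register moves of the low
+   _Float16 element; when the mask bit is clear the low element comes from
+   the pass-through operand, or is zeroed in the maskz forms.  */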
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C)
+{
+  return __builtin_ia32_loadsh_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B)
+{
+  return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C)
+{
+  __builtin_ia32_storesh_mask (__A, __C, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sh (__m128h __A, __m128h  __B)
+{
+  __A[0] = __B[0];
+  return __A;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h  __C, __m128h __D)
+{
+  return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_move_sh (__mmask8 __A, __m128h  __B, __m128h __C)
+{
+  return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A);
+}
+
+/* Intrinsics vcvtph2dq.  */
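+/* vcvtph2dq converts 16 packed _Float16 values (256 bits) to 16 signed
+   32-bit integers (512 bits); the *_round variants take an explicit
+   rounding/SAE control instead of the current direction.  Illustrative
+   sketch (assumes _mm256_set1_ph from these headers):
+     __m512i v = _mm512_cvtph_epi32 (_mm256_set1_ph ((_Float16) 2.5f));
+     // 2.5 rounds to 2 under the default round-to-nearest-even mode
+   */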
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi32 (__m256h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2dq512_mask_round (__A,
+                                           (__v16si)
+                                           _mm512_setzero_si512 (),
+                                           (__mmask16) -1,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2dq512_mask_round (__C,
+                                           (__v16si) __A,
+                                           __B,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi32 (__mmask16 __A, __m256h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2dq512_mask_round (__B,
+                                           (__v16si)
+                                           _mm512_setzero_si512 (),
+                                           __A,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi32 (__m256h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2dq512_mask_round (__A,
+                                           (__v16si)
+                                           _mm512_setzero_si512 (),
+                                           (__mmask16) -1,
+                                           __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi32 (__m512i __A, __mmask16 __B, __m256h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2dq512_mask_round (__C,
+                                           (__v16si) __A,
+                                           __B,
+                                           __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2dq512_mask_round (__B,
+                                           (__v16si)
+                                           _mm512_setzero_si512 (),
+                                           __A,
+                                           __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epi32(A, B)                                 \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2dq512_mask_round ((A),                                \
+                                          (__v16si)                    \
+                                          _mm512_setzero_si512 (),     \
+                                          (__mmask16)-1,               \
+                                          (B)))
+
+#define _mm512_mask_cvt_roundph_epi32(A, B, C, D)                      \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2dq512_mask_round ((C), (__v16si)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epi32(A, B, C)                                \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2dq512_mask_round ((B),                                \
+                                          (__v16si)                    \
+                                          _mm512_setzero_si512 (),     \
+                                          (A),                         \
+                                          (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2udq.  */
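+/* vcvtph2udq is the unsigned counterpart of vcvtph2dq above, producing
+   16 unsigned 32-bit integers.  */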
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu32 (__m256h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2udq512_mask_round (__A,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            (__mmask16) -1,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2udq512_mask_round (__C,
+                                            (__v16si) __A,
+                                            __B,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu32 (__mmask16 __A, __m256h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2udq512_mask_round (__B,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            __A,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu32 (__m256h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2udq512_mask_round (__A,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            (__mmask16) -1,
+                                            __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu32 (__m512i __A, __mmask16 __B, __m256h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2udq512_mask_round (__C,
+                                            (__v16si) __A,
+                                            __B,
+                                            __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2udq512_mask_round (__B,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            __A,
+                                            __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epu32(A, B)                                 \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2udq512_mask_round ((A),                       \
+                                           (__v16si)                   \
+                                           _mm512_setzero_si512 (),    \
+                                           (__mmask16)-1,              \
+                                           (B)))
+
+#define _mm512_mask_cvt_roundph_epu32(A, B, C, D)                      \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2udq512_mask_round ((C), (__v16si)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu32(A, B, C)                                \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2udq512_mask_round ((B),                       \
+                                           (__v16si)                   \
+                                           _mm512_setzero_si512 (),    \
+                                           (A),                        \
+                                           (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2dq.  */
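+/* vcvttph2dq converts with truncation (round toward zero) to signed
+   32-bit integers.  */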
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi32 (__m256h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2dq512_mask_round (__A,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            (__mmask16) -1,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2dq512_mask_round (__C,
+                                            (__v16si) __A,
+                                            __B,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi32 (__mmask16 __A, __m256h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2dq512_mask_round (__B,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            __A,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi32 (__m256h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2dq512_mask_round (__A,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            (__mmask16) -1,
+                                            __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi32 (__m512i __A, __mmask16 __B,
+                               __m256h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2dq512_mask_round (__C,
+                                            (__v16si) __A,
+                                            __B,
+                                            __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2dq512_mask_round (__B,
+                                            (__v16si)
+                                            _mm512_setzero_si512 (),
+                                            __A,
+                                            __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epi32(A, B)                                        \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvttph2dq512_mask_round ((A),                       \
+                                           (__v16si)                   \
+                                           (_mm512_setzero_si512 ()),  \
+                                           (__mmask16)(-1), (B)))
+
+#define _mm512_mask_cvtt_roundph_epi32(A, B, C, D)             \
+  ((__m512i)                                                   \
+   __builtin_ia32_vcvttph2dq512_mask_round ((C),               \
+                                           (__v16si)(A),       \
+                                           (B),                \
+                                           (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi32(A, B, C)                       \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvttph2dq512_mask_round ((B),                       \
+                                           (__v16si)                   \
+                                           _mm512_setzero_si512 (),    \
+                                           (A),                        \
+                                           (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2udq.  */
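+/* vcvttph2udq converts with truncation to unsigned 32-bit integers.  */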
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu32 (__m256h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2udq512_mask_round (__A,
+                                             (__v16si)
+                                             _mm512_setzero_si512 (),
+                                             (__mmask16) -1,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu32 (__m512i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2udq512_mask_round (__C,
+                                             (__v16si) __A,
+                                             __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu32 (__mmask16 __A, __m256h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2udq512_mask_round (__B,
+                                             (__v16si)
+                                             _mm512_setzero_si512 (),
+                                             __A,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu32 (__m256h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2udq512_mask_round (__A,
+                                             (__v16si)
+                                             _mm512_setzero_si512 (),
+                                             (__mmask16) -1,
+                                             __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu32 (__m512i __A, __mmask16 __B,
+                               __m256h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2udq512_mask_round (__C,
+                                             (__v16si) __A,
+                                             __B,
+                                             __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2udq512_mask_round (__B,
+                                             (__v16si)
+                                             _mm512_setzero_si512 (),
+                                             __A,
+                                             __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epu32(A, B)                                        \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvttph2udq512_mask_round ((A),                      \
+                                            (__v16si)                  \
+                                            _mm512_setzero_si512 (),   \
+                                            (__mmask16)-1,             \
+                                            (B)))
+
+#define _mm512_mask_cvtt_roundph_epu32(A, B, C, D)             \
+  ((__m512i)                                                   \
+   __builtin_ia32_vcvttph2udq512_mask_round ((C),              \
+                                            (__v16si)(A),      \
+                                            (B),               \
+                                            (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu32(A, B, C)                       \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvttph2udq512_mask_round ((B),                      \
+                                            (__v16si)                  \
+                                            _mm512_setzero_si512 (),   \
+                                            (A),                       \
+                                            (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtdq2ph.  */
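+/* vcvtdq2ph converts 16 packed signed 32-bit integers (512 bits) to
+   _Float16, narrowing to a 256-bit result.  */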
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi32_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A,
+                                                _mm256_setzero_ph (),
+                                                (__mmask16) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C,
+                                                __A,
+                                                __B,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi32_ph (__mmask16 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B,
+                                                _mm256_setzero_ph (),
+                                                __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi32_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A,
+                                                _mm256_setzero_ph (),
+                                                (__mmask16) -1,
+                                                __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C,
+                                                __A,
+                                                __B,
+                                                __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B,
+                                                _mm256_setzero_ph (),
+                                                __A,
+                                                __C);
+}
+
+#else
+#define _mm512_cvt_roundepi32_ph(A, B)                                 \
+  (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(A),               \
+                                          _mm256_setzero_ph (),        \
+                                          (__mmask16)-1,               \
+                                          (B)))
+
+#define _mm512_mask_cvt_roundepi32_ph(A, B, C, D)              \
+  (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(C),       \
+                                          (A),                 \
+                                          (B),                 \
+                                          (D)))
+
+#define _mm512_maskz_cvt_roundepi32_ph(A, B, C)                                \
+  (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(B),               \
+                                          _mm256_setzero_ph (),        \
+                                          (A),                         \
+                                          (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtudq2ph.  */
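+/* vcvtudq2ph is the unsigned-input counterpart of vcvtdq2ph above.  */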
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu32_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+                                                 _mm256_setzero_ph (),
+                                                 (__mmask16) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu32_ph (__m256h __A, __mmask16 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+                                                 __A,
+                                                 __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu32_ph (__mmask16 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+                                                 _mm256_setzero_ph (),
+                                                 __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu32_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A,
+                                                 _mm256_setzero_ph (),
+                                                 (__mmask16) -1,
+                                                 __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C,
+                                                 __A,
+                                                 __B,
+                                                 __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu32_ph (__mmask16 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B,
+                                                 _mm256_setzero_ph (),
+                                                 __A,
+                                                 __C);
+}
+
+#else
+#define _mm512_cvt_roundepu32_ph(A, B)                                 \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(A),              \
+                                           _mm256_setzero_ph (),       \
+                                           (__mmask16)-1,              \
+                                           (B)))
+
+#define _mm512_mask_cvt_roundepu32_ph(A, B, C, D)                      \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(C),              \
+                                           (A),                        \
+                                           (B),                        \
+                                           (D)))
+
+#define _mm512_maskz_cvt_roundepu32_ph(A, B, C)                                \
+  (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(B),              \
+                                           _mm256_setzero_ph (),       \
+                                           (A),                        \
+                                           (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2qq.  */
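+/* vcvtph2qq converts the 8 _Float16 values of a 128-bit vector to 8
+   signed 64-bit integers in a 512-bit result.  */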
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi64 (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round (__A,
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round (__B,
+                                                _mm512_setzero_si512 (),
+                                                __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round (__A,
+                                                _mm512_setzero_si512 (),
+                                                (__mmask8) -1,
+                                                __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvtph2qq512_mask_round (__B,
+                                                _mm512_setzero_si512 (),
+                                                __A,
+                                                __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epi64(A, B)                                 \
+  (__builtin_ia32_vcvtph2qq512_mask_round ((A),                                \
+                                          _mm512_setzero_si512 (),     \
+                                          (__mmask8)-1,                \
+                                          (B)))
+
+#define _mm512_mask_cvt_roundph_epi64(A, B, C, D)              \
+  (__builtin_ia32_vcvtph2qq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epi64(A, B, C)                                \
+  (__builtin_ia32_vcvtph2qq512_mask_round ((B),                                \
+                                          _mm512_setzero_si512 (),     \
+                                          (A),                         \
+                                          (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2uqq.  */
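+/* vcvtph2uqq is the unsigned counterpart of vcvtph2qq above.  */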
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round (__A,
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round (__B,
+                                                 _mm512_setzero_si512 (),
+                                                 __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round (__A,
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) -1,
+                                                 __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvtph2uqq512_mask_round (__B,
+                                                 _mm512_setzero_si512 (),
+                                                 __A,
+                                                 __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epu64(A, B)                                 \
+  (__builtin_ia32_vcvtph2uqq512_mask_round ((A),                       \
+                                           _mm512_setzero_si512 (),    \
+                                           (__mmask8)-1,               \
+                                           (B)))
+
+#define _mm512_mask_cvt_roundph_epu64(A, B, C, D)                      \
+  (__builtin_ia32_vcvtph2uqq512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu64(A, B, C)                                \
+  (__builtin_ia32_vcvtph2uqq512_mask_round ((B),                       \
+                                           _mm512_setzero_si512 (),    \
+                                           (A),                        \
+                                           (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2qq.  */
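+/* vcvttph2qq converts with truncation to signed 64-bit integers.  */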
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round (__A,
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round (__B,
+                                                 _mm512_setzero_si512 (),
+                                                 __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round (__A,
+                                                 _mm512_setzero_si512 (),
+                                                 (__mmask8) -1,
+                                                 __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvttph2qq512_mask_round (__B,
+                                                 _mm512_setzero_si512 (),
+                                                 __A,
+                                                 __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epi64(A, B)                                        \
+  (__builtin_ia32_vcvttph2qq512_mask_round ((A),                       \
+                                           _mm512_setzero_si512 (),    \
+                                           (__mmask8)-1,               \
+                                           (B)))
+
+#define _mm512_mask_cvtt_roundph_epi64(A, B, C, D)                     \
+  __builtin_ia32_vcvttph2qq512_mask_round ((C), (A), (B), (D))
+
+#define _mm512_maskz_cvtt_roundph_epi64(A, B, C)                       \
+  (__builtin_ia32_vcvttph2qq512_mask_round ((B),                       \
+                                           _mm512_setzero_si512 (),    \
+                                           (A),                        \
+                                           (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2uqq.  */
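+/* vcvttph2uqq converts with truncation to unsigned 64-bit integers.  */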
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round (__A,
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu64 (__m512i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round (__B,
+                                                  _mm512_setzero_si512 (),
+                                                  __A,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu64 (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round (__A,
+                                                  _mm512_setzero_si512 (),
+                                                  (__mmask8) -1,
+                                                  __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvttph2uqq512_mask_round (__B,
+                                                  _mm512_setzero_si512 (),
+                                                  __A,
+                                                  __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epu64(A, B)                                        \
+  (__builtin_ia32_vcvttph2uqq512_mask_round ((A),                      \
+                                            _mm512_setzero_si512 (),   \
+                                            (__mmask8)-1,              \
+                                            (B)))
+
+#define _mm512_mask_cvtt_roundph_epu64(A, B, C, D)                     \
+  __builtin_ia32_vcvttph2uqq512_mask_round ((C), (A), (B), (D))
+
+#define _mm512_maskz_cvtt_roundph_epu64(A, B, C)                       \
+  (__builtin_ia32_vcvttph2uqq512_mask_round ((B),                      \
+                                            _mm512_setzero_si512 (),   \
+                                            (A),                       \
+                                            (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtqq2ph.  */
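+/* vcvtqq2ph converts 8 packed signed 64-bit integers (512 bits) to
+   _Float16 in a 128-bit result.  */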
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi64_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A,
+                                                _mm_setzero_ph (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C,
+                                                __A,
+                                                __B,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi64_ph (__mmask8 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B,
+                                                _mm_setzero_ph (),
+                                                __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi64_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A,
+                                                _mm_setzero_ph (),
+                                                (__mmask8) -1,
+                                                __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C,
+                                                __A,
+                                                __B,
+                                                __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi64_ph (__mmask8 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B,
+                                                _mm_setzero_ph (),
+                                                __A,
+                                                __C);
+}
+
+#else
+#define _mm512_cvt_roundepi64_ph(A, B)                         \
+  (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(A),                \
+                                          _mm_setzero_ph (),   \
+                                          (__mmask8)-1,        \
+                                          (B)))
+
+#define _mm512_mask_cvt_roundepi64_ph(A, B, C, D)                      \
+  (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepi64_ph(A, B, C)                        \
+  (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(B),                \
+                                          _mm_setzero_ph (),   \
+                                          (A),                 \
+                                          (C)))
+
+#endif /* __OPTIMIZE__ */
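+
+/* Illustrative usage sketch (assumes AVX512FP16 support): convert eight
+   signed 64-bit integers to half precision; the mask form keeps the
+   pass-through value in lanes whose mask bit is clear.
+
+     __m512i v    = _mm512_set1_epi64 (42);
+     __m128h all  = _mm512_cvtepi64_ph (v);
+     __m128h low4 = _mm512_mask_cvtepi64_ph (_mm_setzero_ph (), 0x0F, v);
+
+   all holds 42.0 in every lane; low4 holds 42.0 in lanes 0-3 and 0.0 in
+   lanes 4-7.  */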
+
+/* Intrinsics vcvtuqq2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu64_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A,
+                                                 _mm_setzero_ph (),
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C,
+                                                 __A,
+                                                 __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu64_ph (__mmask8 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B,
+                                                 _mm_setzero_ph (),
+                                                 __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu64_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A,
+                                                 _mm_setzero_ph (),
+                                                 (__mmask8) -1,
+                                                 __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C,
+                                                 __A,
+                                                 __B,
+                                                 __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu64_ph (__mmask8 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B,
+                                                 _mm_setzero_ph (),
+                                                 __A,
+                                                 __C);
+}
+
+#else
+#define _mm512_cvt_roundepu64_ph(A, B)                         \
+  (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(A),       \
+                                           _mm_setzero_ph (),  \
+                                           (__mmask8)-1,       \
+                                           (B)))
+
+#define _mm512_mask_cvt_roundepu64_ph(A, B, C, D)                      \
+  (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundepu64_ph(A, B, C)                        \
+  (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(B),       \
+                                           _mm_setzero_ph (),  \
+                                           (A),                \
+                                           (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtph2w.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epi16 (__m512h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2w512_mask_round (__A,
+                                             (__v32hi)
+                                             _mm512_setzero_si512 (),
+                                             (__mmask32) -1,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epi16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2w512_mask_round (__C,
+                                             (__v32hi) __A,
+                                             __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epi16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2w512_mask_round (__B,
+                                             (__v32hi)
+                                             _mm512_setzero_si512 (),
+                                             __A,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epi16 (__m512h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2w512_mask_round (__A,
+                                             (__v32hi)
+                                             _mm512_setzero_si512 (),
+                                             (__mmask32) -1,
+                                             __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epi16 (__m512i __A, __mmask32 __B, __m512h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2w512_mask_round (__C,
+                                             (__v32hi) __A,
+                                             __B,
+                                             __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2w512_mask_round (__B,
+                                             (__v32hi)
+                                             _mm512_setzero_si512 (),
+                                             __A,
+                                             __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epi16(A, B)                                 \
+  ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((A),                \
+                                                     (__v32hi)         \
+                                                     _mm512_setzero_si512 (), \
+                                                     (__mmask32)-1,    \
+                                                     (B)))
+
+#define _mm512_mask_cvt_roundph_epi16(A, B, C, D)                      \
+  ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((C),                \
+                                                     (__v32hi)(A),     \
+                                                     (B),              \
+                                                     (D)))
+
+#define _mm512_maskz_cvt_roundph_epi16(A, B, C)                                \
+  ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((B),                \
+                                                     (__v32hi)         \
+                                                     _mm512_setzero_si512 (), \
+                                                     (A),              \
+                                                     (C)))
+
+#endif /* __OPTIMIZE__ */
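+
+/* Illustrative usage sketch (assumes AVX512FP16 support): convert 32
+   half-precision values to signed 16-bit integers; the zero-mask form
+   clears lanes whose mask bit is 0.
+
+     __m512h h = _mm512_set1_ph ((_Float16) 7.25f);
+     __m512i w = _mm512_cvtph_epi16 (h);
+     __m512i z = _mm512_maskz_cvtph_epi16 (0xFFFF, h);
+
+   Every lane of w is 7 (rounded with the current rounding mode); z keeps
+   7 in lanes 0-15 and zeroes lanes 16-31.  */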
+
+/* Intrinsics vcvtph2uw.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_epu16 (__m512h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2uw512_mask_round (__A,
+                                              (__v32hi)
+                                              _mm512_setzero_si512 (),
+                                              (__mmask32) -1,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_epu16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_epu16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2uw512_mask_round (__B,
+                                              (__v32hi)
+                                              _mm512_setzero_si512 (),
+                                              __A,
+                                              _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_epu16 (__m512h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2uw512_mask_round (__A,
+                                              (__v32hi)
+                                              _mm512_setzero_si512 (),
+                                              (__mmask32) -1,
+                                              __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_epu16 (__m512i __A, __mmask32 __B, __m512h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvtph2uw512_mask_round (__B,
+                                              (__v32hi)
+                                              _mm512_setzero_si512 (),
+                                              __A,
+                                              __C);
+}
+
+#else
+#define _mm512_cvt_roundph_epu16(A, B)                                 \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2uw512_mask_round ((A),                        \
+                                             (__v32hi)                 \
+                                             _mm512_setzero_si512 (),  \
+                                             (__mmask32)-1, (B)))
+
+#define _mm512_mask_cvt_roundph_epu16(A, B, C, D)                      \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2uw512_mask_round ((C), (__v32hi)(A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_epu16(A, B, C)                                \
+  ((__m512i)                                                           \
+   __builtin_ia32_vcvtph2uw512_mask_round ((B),                        \
+                                             (__v32hi)                 \
+                                             _mm512_setzero_si512 (),  \
+                                             (A),                      \
+                                             (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2w.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epi16 (__m512h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2w512_mask_round (__A,
+                                           (__v32hi)
+                                           _mm512_setzero_si512 (),
+                                           (__mmask32) -1,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epi16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2w512_mask_round (__C,
+                                           (__v32hi) __A,
+                                           __B,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epi16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2w512_mask_round (__B,
+                                           (__v32hi)
+                                           _mm512_setzero_si512 (),
+                                           __A,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epi16 (__m512h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2w512_mask_round (__A,
+                                           (__v32hi)
+                                           _mm512_setzero_si512 (),
+                                           (__mmask32) -1,
+                                           __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epi16 (__m512i __A, __mmask32 __B,
+                               __m512h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2w512_mask_round (__C,
+                                           (__v32hi) __A,
+                                           __B,
+                                           __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2w512_mask_round (__B,
+                                           (__v32hi)
+                                           _mm512_setzero_si512 (),
+                                           __A,
+                                           __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epi16(A, B)                                    \
+  ((__m512i)                                                       \
+   __builtin_ia32_vcvttph2w512_mask_round ((A),                            \
+                                          (__v32hi)                \
+                                          _mm512_setzero_si512 (), \
+                                          (__mmask32)-1,           \
+                                          (B)))
+
+#define _mm512_mask_cvtt_roundph_epi16(A, B, C, D)             \
+  ((__m512i)                                                   \
+   __builtin_ia32_vcvttph2w512_mask_round ((C),                        \
+                                          (__v32hi)(A),        \
+                                          (B),                 \
+                                          (D)))
+
+#define _mm512_maskz_cvtt_roundph_epi16(A, B, C)                   \
+  ((__m512i)                                                       \
+   __builtin_ia32_vcvttph2w512_mask_round ((B),                            \
+                                          (__v32hi)                \
+                                          _mm512_setzero_si512 (), \
+                                          (A),                     \
+                                          (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvttph2uw.  */
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvttph_epu16 (__m512h __A)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2uw512_mask_round (__A,
+                                            (__v32hi)
+                                            _mm512_setzero_si512 (),
+                                            (__mmask32) -1,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvttph_epu16 (__m512i __A, __mmask32 __B, __m512h __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2uw512_mask_round (__C,
+                                            (__v32hi) __A,
+                                            __B,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvttph_epu16 (__mmask32 __A, __m512h __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2uw512_mask_round (__B,
+                                            (__v32hi)
+                                            _mm512_setzero_si512 (),
+                                            __A,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtt_roundph_epu16 (__m512h __A, int __B)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2uw512_mask_round (__A,
+                                            (__v32hi)
+                                            _mm512_setzero_si512 (),
+                                            (__mmask32) -1,
+                                            __B);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtt_roundph_epu16 (__m512i __A, __mmask32 __B,
+                               __m512h __C, int __D)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2uw512_mask_round (__C,
+                                            (__v32hi) __A,
+                                            __B,
+                                            __D);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C)
+{
+  return (__m512i)
+    __builtin_ia32_vcvttph2uw512_mask_round (__B,
+                                            (__v32hi)
+                                            _mm512_setzero_si512 (),
+                                            __A,
+                                            __C);
+}
+
+#else
+#define _mm512_cvtt_roundph_epu16(A, B)                                     \
+  ((__m512i)                                                        \
+   __builtin_ia32_vcvttph2uw512_mask_round ((A),                    \
+                                           (__v32hi)                \
+                                           _mm512_setzero_si512 (), \
+                                           (__mmask32)-1,           \
+                                           (B)))
+
+#define _mm512_mask_cvtt_roundph_epu16(A, B, C, D)             \
+  ((__m512i)                                                   \
+   __builtin_ia32_vcvttph2uw512_mask_round ((C),               \
+                                           (__v32hi)(A),       \
+                                           (B),                \
+                                           (D)))
+
+#define _mm512_maskz_cvtt_roundph_epu16(A, B, C)                    \
+  ((__m512i)                                                        \
+   __builtin_ia32_vcvttph2uw512_mask_round ((B),                    \
+                                           (__v32hi)                \
+                                           _mm512_setzero_si512 (), \
+                                           (A),                     \
+                                           (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtw2ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepi16_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A,
+                                               _mm512_setzero_ph (),
+                                               (__mmask32) -1,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepi16_ph (__m512h __A, __mmask32 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C,
+                                               __A,
+                                               __B,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepi16_ph (__mmask32 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B,
+                                               _mm512_setzero_ph (),
+                                               __A,
+                                               _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepi16_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A,
+                                               _mm512_setzero_ph (),
+                                               (__mmask32) -1,
+                                               __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepi16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C,
+                                               __A,
+                                               __B,
+                                               __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepi16_ph (__mmask32 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B,
+                                               _mm512_setzero_ph (),
+                                               __A,
+                                               __C);
+}
+
+#else
+#define _mm512_cvt_roundepi16_ph(A, B)                         \
+  (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(A),                \
+                                         _mm512_setzero_ph (), \
+                                         (__mmask32)-1,        \
+                                         (B)))
+
+#define _mm512_mask_cvt_roundepi16_ph(A, B, C, D)      \
+  (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(C),        \
+                                         (A),          \
+                                         (B),          \
+                                         (D)))
+
+#define _mm512_maskz_cvt_roundepi16_ph(A, B, C)                        \
+  (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(B),                \
+                                         _mm512_setzero_ph (), \
+                                         (A),                  \
+                                         (C)))
+
+#endif /* __OPTIMIZE__ */
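+
+/* Illustrative usage sketch (assumes AVX512FP16 support): widen 32 signed
+   16-bit integers to half precision and convert back.  Integers of
+   magnitude up to 2048 are exactly representable in _Float16, so such
+   values survive the round trip unchanged.
+
+     __m512i w  = _mm512_set1_epi16 (1234);
+     __m512h h  = _mm512_cvtepi16_ph (w);
+     __m512i w2 = _mm512_cvtph_epi16 (h);
+
+   w2 compares equal to w.  */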
+
+/* Intrinsics vcvtuw2ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtepu16_ph (__m512i __A)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A,
+                                                 _mm512_setzero_ph (),
+                                                 (__mmask32) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtepu16_ph (__m512h __A, __mmask32 __B, __m512i __C)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C,
+                                                __A,
+                                                __B,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtepu16_ph (__mmask32 __A, __m512i __B)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B,
+                                                _mm512_setzero_ph (),
+                                                __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundepu16_ph (__m512i __A, int __B)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A,
+                                                _mm512_setzero_ph (),
+                                                (__mmask32) -1,
+                                                __B);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundepu16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C,
+                                                __A,
+                                                __B,
+                                                __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundepu16_ph (__mmask32 __A, __m512i __B, int __C)
+{
+  return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B,
+                                                _mm512_setzero_ph (),
+                                                __A,
+                                                __C);
+}
+
+#else
+#define _mm512_cvt_roundepu16_ph(A, B)                                 \
+  (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(A),               \
+                                          _mm512_setzero_ph (),        \
+                                          (__mmask32)-1,               \
+                                          (B)))
+
+#define _mm512_mask_cvt_roundepu16_ph(A, B, C, D)              \
+  (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(C),       \
+                                          (A),                 \
+                                          (B),                 \
+                                          (D)))
+
+#define _mm512_maskz_cvt_roundepu16_ph(A, B, C)                                \
+  (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(B),               \
+                                          _mm512_setzero_ph (),        \
+                                          (A),                         \
+                                          (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtsh2si, vcvtsh2us.  */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_i32 (__m128h __A)
+{
+  return (int) __builtin_ia32_vcvtsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_u32 (__m128h __A)
+{
+  return (unsigned)
+    __builtin_ia32_vcvtsh2usi32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_i32 (__m128h __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvtsh2si32_round (__A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_u32 (__m128h __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvtsh2usi32_round (__A, __R);
+}
+
+#else
+#define _mm_cvt_roundsh_i32(A, B)              \
+  ((int)__builtin_ia32_vcvtsh2si32_round ((A), (B)))
+#define _mm_cvt_roundsh_u32(A, B)              \
+  ((unsigned)__builtin_ia32_vcvtsh2usi32_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_i64 (__m128h __A)
+{
+  return (long long)
+    __builtin_ia32_vcvtsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_u64 (__m128h __A)
+{
+  return (unsigned long long)
+    __builtin_ia32_vcvtsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_i64 (__m128h __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvtsh2si64_round (__A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_u64 (__m128h __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvtsh2usi64_round (__A, __R);
+}
+
+#else
+#define _mm_cvt_roundsh_i64(A, B)                      \
+  ((long long)__builtin_ia32_vcvtsh2si64_round ((A), (B)))
+#define _mm_cvt_roundsh_u64(A, B)                      \
+  ((unsigned long long)__builtin_ia32_vcvtsh2usi64_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
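+
+/* Illustrative usage sketch (assumes AVX512FP16 support): read the low
+   half-precision element of a vector as a 32-bit integer, either with the
+   current rounding mode or with an explicit one.
+
+     __m128h h = _mm_set_sh ((_Float16) 3.5f);
+     int     a = _mm_cvtsh_i32 (h);
+     int     b = _mm_cvt_roundsh_i32 (h, _MM_FROUND_TO_ZERO
+                                         | _MM_FROUND_NO_EXC);
+
+   Under round-to-nearest-even a is 4, while the explicit truncating
+   rounding makes b 3.  */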
+
+/* Intrinsics vcvttsh2si, vcvttsh2us.  */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_i32 (__m128h __A)
+{
+  return (int)
+    __builtin_ia32_vcvttsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_u32 (__m128h __A)
+{
+  return (unsigned)
+    __builtin_ia32_vcvttsh2usi32_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_i32 (__m128h __A, const int __R)
+{
+  return (int) __builtin_ia32_vcvttsh2si32_round (__A, __R);
+}
+
+extern __inline unsigned
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_u32 (__m128h __A, const int __R)
+{
+  return (unsigned) __builtin_ia32_vcvttsh2usi32_round (__A, __R);
+}
+
+#else
+#define _mm_cvtt_roundsh_i32(A, B)             \
+  ((int)__builtin_ia32_vcvttsh2si32_round ((A), (B)))
+#define _mm_cvtt_roundsh_u32(A, B)             \
+  ((unsigned)__builtin_ia32_vcvttsh2usi32_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_i64 (__m128h __A)
+{
+  return (long long)
+    __builtin_ia32_vcvttsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsh_u64 (__m128h __A)
+{
+  return (unsigned long long)
+    __builtin_ia32_vcvttsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_i64 (__m128h __A, const int __R)
+{
+  return (long long) __builtin_ia32_vcvttsh2si64_round (__A, __R);
+}
+
+extern __inline unsigned long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_roundsh_u64 (__m128h __A, const int __R)
+{
+  return (unsigned long long) __builtin_ia32_vcvttsh2usi64_round (__A, __R);
+}
+
+#else
+#define _mm_cvtt_roundsh_i64(A, B)                     \
+  ((long long)__builtin_ia32_vcvttsh2si64_round ((A), (B)))
+#define _mm_cvtt_roundsh_u64(A, B)                     \
+  ((unsigned long long)__builtin_ia32_vcvttsh2usi64_round ((A), (B)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
+
+/* Intrinsics vcvtsi2sh, vcvtusi2sh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti32_sh (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvtsi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu32_sh (__m128h __A, unsigned int __B)
+{
+  return __builtin_ia32_vcvtusi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi32_sh (__m128h __A, int __B, const int __R)
+{
+  return __builtin_ia32_vcvtsi2sh32_round (__A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu32_sh (__m128h __A, unsigned int __B, const int __R)
+{
+  return __builtin_ia32_vcvtusi2sh32_round (__A, __B, __R);
+}
+
+#else
+#define _mm_cvt_roundi32_sh(A, B, C)           \
+  (__builtin_ia32_vcvtsi2sh32_round ((A), (B), (C)))
+#define _mm_cvt_roundu32_sh(A, B, C)           \
+  (__builtin_ia32_vcvtusi2sh32_round ((A), (B), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+#ifdef __x86_64__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvti64_sh (__m128h __A, long long __B)
+{
+  return __builtin_ia32_vcvtsi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtu64_sh (__m128h __A, unsigned long long __B)
+{
+  return __builtin_ia32_vcvtusi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundi64_sh (__m128h __A, long long __B, const int __R)
+{
+  return __builtin_ia32_vcvtsi2sh64_round (__A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundu64_sh (__m128h __A, unsigned long long __B, const int __R)
+{
+  return __builtin_ia32_vcvtusi2sh64_round (__A, __B, __R);
+}
+
+#else
+#define _mm_cvt_roundi64_sh(A, B, C)           \
+  (__builtin_ia32_vcvtsi2sh64_round ((A), (B), (C)))
+#define _mm_cvt_roundu64_sh(A, B, C)           \
+  (__builtin_ia32_vcvtusi2sh64_round ((A), (B), (C)))
+
+#endif /* __OPTIMIZE__ */
+#endif /* __x86_64__ */
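+
+/* Illustrative usage sketch (assumes AVX512FP16 support): convert an
+   integer to half precision and insert it into the low element of a
+   vector, copying the upper elements from the first operand.
+
+     __m128h src = _mm_setzero_ph ();
+     __m128h r   = _mm_cvti32_sh (src, 100);
+
+   The low element of r is 100.0; the other elements come from src.  */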
+
+/* Intrinsics vcvtph2pd.  */
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtph_pd (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2pd512_mask_round (__A,
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtph_pd (__m512d __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2pd512_mask_round (__B,
+                                                _mm512_setzero_pd (),
+                                                __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundph_pd (__m128h __A, int __B)
+{
+  return __builtin_ia32_vcvtph2pd512_mask_round (__A,
+                                                _mm512_setzero_pd (),
+                                                (__mmask8) -1,
+                                                __B);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundph_pd (__m512d __A, __mmask8 __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundph_pd (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_vcvtph2pd512_mask_round (__B,
+                                                _mm512_setzero_pd (),
+                                                __A,
+                                                __C);
+}
+
+#else
+#define _mm512_cvt_roundph_pd(A, B)                                    \
+  (__builtin_ia32_vcvtph2pd512_mask_round ((A),                        \
+                                          _mm512_setzero_pd (),        \
+                                          (__mmask8)-1,                \
+                                          (B)))
+
+#define _mm512_mask_cvt_roundph_pd(A, B, C, D)                         \
+  (__builtin_ia32_vcvtph2pd512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundph_pd(A, B, C)                           \
+  (__builtin_ia32_vcvtph2pd512_mask_round ((B),                        \
+                                          _mm512_setzero_pd (),        \
+                                          (A),                 \
+                                          (C)))
+
+#endif /* __OPTIMIZE__ */
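+
+/* Illustrative usage sketch (assumes AVX512FP16 support): widen the eight
+   half-precision elements of a 128-bit vector to double precision.
+
+     __m128h h = _mm_set_sh ((_Float16) 1.5f);
+     __m512d d = _mm512_cvtph_pd (h);
+
+   Lane 0 of d is 1.5; the remaining lanes, widened from the zeroed upper
+   elements of h, are 0.0.  */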
+
+/* Intrinsics vcvtph2psx.  */
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtxph_ps (__m256h __A)
+{
+  return __builtin_ia32_vcvtph2psx512_mask_round (__A,
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtxph_ps (__m512 __A, __mmask16 __B, __m256h __C)
+{
+  return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtxph_ps (__mmask16 __A, __m256h __B)
+{
+  return __builtin_ia32_vcvtph2psx512_mask_round (__B,
+                                                 _mm512_setzero_ps (),
+                                                 __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtx_roundph_ps (__m256h __A, int __B)
+{
+  return __builtin_ia32_vcvtph2psx512_mask_round (__A,
+                                                 _mm512_setzero_ps (),
+                                                 (__mmask16) -1,
+                                                 __B);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtx_roundph_ps (__m512 __A, __mmask16 __B, __m256h __C, int __D)
+{
+  return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, __D);
+}
+
+extern __inline __m512
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtx_roundph_ps (__mmask16 __A, __m256h __B, int __C)
+{
+  return __builtin_ia32_vcvtph2psx512_mask_round (__B,
+                                                 _mm512_setzero_ps (),
+                                                 __A,
+                                                 __C);
+}
+
+#else
+#define _mm512_cvtx_roundph_ps(A, B)                                   \
+  (__builtin_ia32_vcvtph2psx512_mask_round ((A),                       \
+                                           _mm512_setzero_ps (),       \
+                                           (__mmask16)-1,              \
+                                           (B)))
+
+#define _mm512_mask_cvtx_roundph_ps(A, B, C, D)                                \
+  (__builtin_ia32_vcvtph2psx512_mask_round ((C), (A), (B), (D)))
+
+#define _mm512_maskz_cvtx_roundph_ps(A, B, C)                          \
+  (__builtin_ia32_vcvtph2psx512_mask_round ((B),                       \
+                                           _mm512_setzero_ps (),       \
+                                           (A),                        \
+                                           (C)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtps2ph.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtxps_ph (__m512 __A)
+{
+  return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A,
+                                                 _mm256_setzero_ph (),
+                                                 (__mmask16) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtxps_ph (__m256h __A, __mmask16 __B, __m512 __C)
+{
+  return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C,
+                                                 __A, __B,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtxps_ph (__mmask16 __A, __m512 __B)
+{
+  return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B,
+                                                 _mm256_setzero_ph (),
+                                                 __A,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtx_roundps_ph (__m512 __A, int __B)
+{
+  return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A,
+                                                 _mm256_setzero_ph (),
+                                                 (__mmask16) -1,
+                                                 __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtx_roundps_ph (__m256h __A, __mmask16 __B, __m512 __C, int __D)
+{
+  return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C,
+                                                 __A, __B, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtx_roundps_ph (__mmask16 __A, __m512 __B, int __C)
+{
+  return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B,
+                                                 _mm256_setzero_ph (),
+                                                 __A, __C);
+}
+
+#else
+#define _mm512_cvtx_roundps_ph(A, B)                           \
+  (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(A),      \
+                                           _mm256_setzero_ph (),\
+                                           (__mmask16)-1, (B)))
+
+#define _mm512_mask_cvtx_roundps_ph(A, B, C, D)                        \
+  (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(C),      \
+                                           (A), (B), (D)))
+
+#define _mm512_maskz_cvtx_roundps_ph(A, B, C)                  \
+  (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(B),      \
+                                           _mm256_setzero_ph (),\
+                                           (A), (C)))
+#endif /* __OPTIMIZE__ */
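+
+/* Illustrative usage sketch (assumes AVX512FP16 support): narrow sixteen
+   floats to half precision and widen them back.  Values exactly
+   representable in _Float16, such as 0.5, survive the round trip; others
+   are rounded during the narrowing step.
+
+     __m512  s  = _mm512_set1_ps (0.5f);
+     __m256h h  = _mm512_cvtxps_ph (s);
+     __m512  s2 = _mm512_cvtxph_ps (h);
+
+   s2 compares equal to s.  */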
+
+/* Intrinsics vcvtpd2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvtpd_ph (__m512d __A)
+{
+  return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A,
+                                                _mm_setzero_ph (),
+                                                (__mmask8) -1,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m512d __C)
+{
+  return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C,
+                                                __A, __B,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvtpd_ph (__mmask8 __A, __m512d __B)
+{
+  return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B,
+                                                _mm_setzero_ph (),
+                                                __A,
+                                                _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cvt_roundpd_ph (__m512d __A, int __B)
+{
+  return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A,
+                                                _mm_setzero_ph (),
+                                                (__mmask8) -1,
+                                                __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cvt_roundpd_ph (__m128h __A, __mmask8 __B, __m512d __C, int __D)
+{
+  return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C,
+                                                __A, __B, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_cvt_roundpd_ph (__mmask8 __A, __m512d __B, int __C)
+{
+  return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B,
+                                                _mm_setzero_ph (),
+                                                __A, __C);
+}
+
+#else
+#define _mm512_cvt_roundpd_ph(A, B)                            \
+  (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(A),                \
+                                          _mm_setzero_ph (),   \
+                                          (__mmask8)-1, (B)))
+
+#define _mm512_mask_cvt_roundpd_ph(A, B, C, D)                 \
+  (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(C),                \
+                                          (A), (B), (D)))
+
+#define _mm512_maskz_cvt_roundpd_ph(A, B, C)                   \
+  (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(B),                \
+                                          _mm_setzero_ph (),   \
+                                          (A), (C)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcvtsh2ss, vcvtsh2sd.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_ss (__m128 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A,
+                                             _mm_setzero_ps (),
+                                             (__mmask8) -1,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsh_ss (__m128 __A, __mmask8 __B, __m128 __C,
+                        __m128h __D)
+{
+  return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsh_ss (__mmask8 __A, __m128 __B,
+                         __m128h __C)
+{
+  return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B,
+                                             _mm_setzero_ps (),
+                                             __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsh_sd (__m128d __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A,
+                                             _mm_setzero_pd (),
+                                             (__mmask8) -1,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsh_sd (__m128d __A, __mmask8 __B, __m128d __C,
+                        __m128h __D)
+{
+  return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsh_sd (__mmask8 __A, __m128d __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B,
+                                             _mm_setzero_pd (),
+                                             __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_ss (__m128 __A, __m128h __B, const int __R)
+{
+  return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A,
+                                             _mm_setzero_ps (),
+                                             (__mmask8) -1, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsh_ss (__m128 __A, __mmask8 __B, __m128 __C,
+                        __m128h __D, const int __R)
+{
+  return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsh_ss (__mmask8 __A, __m128 __B,
+                         __m128h __C, const int __R)
+{
+  return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B,
+                                             _mm_setzero_ps (),
+                                             __A, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsh_sd (__m128d __A, __m128h __B, const int __R)
+{
+  return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A,
+                                             _mm_setzero_pd (),
+                                             (__mmask8) -1, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsh_sd (__m128d __A, __mmask8 __B, __m128d __C,
+                        __m128h __D, const int __R)
+{
+  return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsh_sd (__mmask8 __A, __m128d __B, __m128h __C, const int __R)
+{
+  return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B,
+                                             _mm_setzero_pd (),
+                                             __A, __R);
+}
+
+#else
+#define _mm_cvt_roundsh_ss(A, B, R)                            \
+  (__builtin_ia32_vcvtsh2ss_mask_round ((B), (A),              \
+                                       _mm_setzero_ps (),      \
+                                       (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsh_ss(A, B, C, D, R)                         \
+  (__builtin_ia32_vcvtsh2ss_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundsh_ss(A, B, C, R)                   \
+  (__builtin_ia32_vcvtsh2ss_mask_round ((C), (B),              \
+                                       _mm_setzero_ps (),      \
+                                       (A), (R)))
+
+#define _mm_cvt_roundsh_sd(A, B, R)                            \
+  (__builtin_ia32_vcvtsh2sd_mask_round ((B), (A),              \
+                                       _mm_setzero_pd (),      \
+                                       (__mmask8) -1, (R)))
+
+#define _mm_mask_cvt_roundsh_sd(A, B, C, D, R)                         \
+  (__builtin_ia32_vcvtsh2sd_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundsh_sd(A, B, C, R)                   \
+  (__builtin_ia32_vcvtsh2sd_mask_round ((C), (B),              \
+                                       _mm_setzero_pd (),      \
+                                       (A), (R)))
+
+#endif /* __OPTIMIZE__ */
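+
+/* Illustrative usage sketch (assumes AVX512FP16 support): convert the low
+   half-precision element of the second operand to single precision; the
+   upper elements of the result are copied from the first operand.
+
+     __m128  hi = _mm_set1_ps (9.0f);
+     __m128h lo = _mm_set_sh ((_Float16) 1.25f);
+     __m128  r  = _mm_cvtsh_ss (hi, lo);
+
+   r holds { 1.25, 9.0, 9.0, 9.0 }.  */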
+
+/* Intrinsics vcvtss2sh, vcvtsd2sh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_sh (__m128h __A, __m128 __B)
+{
+  return __builtin_ia32_vcvtss2sh_mask_round (__B, __A,
+                                             _mm_setzero_ph (),
+                                             (__mmask8) -1,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D)
+{
+  return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtss_sh (__mmask8 __A, __m128h __B, __m128 __C)
+{
+  return __builtin_ia32_vcvtss2sh_mask_round (__C, __B,
+                                             _mm_setzero_ph (),
+                                             __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_sh (__m128h __A, __m128d __B)
+{
+  return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A,
+                                             _mm_setzero_ph (),
+                                             (__mmask8) -1,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D)
+{
+  return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B,
+                                             _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsd_sh (__mmask8 __A, __m128h __B, __m128d __C)
+{
+  return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B,
+                                             _mm_setzero_ph (),
+                                             __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundss_sh (__m128h __A, __m128 __B, const int __R)
+{
+  return __builtin_ia32_vcvtss2sh_mask_round (__B, __A,
+                                             _mm_setzero_ph (),
+                                             (__mmask8) -1, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D,
+                        const int __R)
+{
+  return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundss_sh (__mmask8 __A, __m128h __B, __m128 __C,
+                         const int __R)
+{
+  return __builtin_ia32_vcvtss2sh_mask_round (__C, __B,
+                                             _mm_setzero_ph (),
+                                             __A, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_roundsd_sh (__m128h __A, __m128d __B, const int __R)
+{
+  return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A,
+                                             _mm_setzero_ph (),
+                                             (__mmask8) -1, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvt_roundsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D,
+                        const int __R)
+{
+  return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvt_roundsd_sh (__mmask8 __A, __m128h __B, __m128d __C,
+                         const int __R)
+{
+  return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B,
+                                             _mm_setzero_ph (),
+                                             __A, __R);
+}
+
+#else
+#define _mm_cvt_roundss_sh(A, B, R)                            \
+  (__builtin_ia32_vcvtss2sh_mask_round ((B), (A),              \
+                                       _mm_setzero_ph (),      \
+                                       (__mmask8) -1, R))
+
+#define _mm_mask_cvt_roundss_sh(A, B, C, D, R)                         \
+  (__builtin_ia32_vcvtss2sh_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundss_sh(A, B, C, R)                   \
+  (__builtin_ia32_vcvtss2sh_mask_round ((C), (B),              \
+                                       _mm_setzero_ph (),      \
+                                       A, R))
+
+#define _mm_cvt_roundsd_sh(A, B, R)                            \
+  (__builtin_ia32_vcvtsd2sh_mask_round ((B), (A),              \
+                                       _mm_setzero_ph (),      \
+                                       (__mmask8) -1, R))
+
+#define _mm_mask_cvt_roundsd_sh(A, B, C, D, R)                         \
+  (__builtin_ia32_vcvtsd2sh_mask_round ((D), (C), (A), (B), (R)))
+
+#define _mm_maskz_cvt_roundsd_sh(A, B, C, R)                   \
+  (__builtin_ia32_vcvtsd2sh_mask_round ((C), (B),              \
+                                       _mm_setzero_ph (),      \
+                                       (A), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmaddsub[132,213,231]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) -1,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) __U,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_mask3 ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_maskz ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmaddsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmaddsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+                              __m512h __C, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmaddsub_round_ph (__m512h __A, __m512h __B, __m512h __C,
+                               __mmask32 __U, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_mask3 ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmaddsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+                               __m512h __C, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddsubph512_maskz ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmaddsub_round_ph(A, B, C, R)                           \
+  ((__m512h)__builtin_ia32_vfmaddsubph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R)                   \
+  ((__m512h)__builtin_ia32_vfmaddsubph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R)                  \
+  ((__m512h)__builtin_ia32_vfmaddsubph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R)                  \
+  ((__m512h)__builtin_ia32_vfmaddsubph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmsubadd[132,213,231]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+  _mm512_fmsubadd_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) -1,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_ph (__m512h __A, __mmask32 __U,
+                        __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) __U,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_ph (__m512h __A, __m512h __B,
+                         __m512h __C, __mmask32 __U)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_mask3 ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_ph (__mmask32 __U, __m512h __A,
+                         __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_maskz ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsubadd_round_ph (__m512h __A, __m512h __B,
+                         __m512h __C, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsubadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+                              __m512h __C, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsubadd_round_ph (__m512h __A, __m512h __B, __m512h __C,
+                               __mmask32 __U, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_mask3 ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsubadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+                               __m512h __C, const int __R)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubaddph512_maskz ((__v32hf) __A,
+                                        (__v32hf) __B,
+                                        (__v32hf) __C,
+                                        (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmsubadd_round_ph(A, B, C, R)                           \
+  ((__m512h)__builtin_ia32_vfmsubaddph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R)                   \
+  ((__m512h)__builtin_ia32_vfmsubaddph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R)                  \
+  ((__m512h)__builtin_ia32_vfmsubaddph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R)                  \
+  ((__m512h)__builtin_ia32_vfmsubaddph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmadd[132,213,231]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+  _mm512_fmadd_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+                                    (__v32hf) __B,
+                                    (__v32hf) __C,
+                                    (__mmask32) -1,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+                                    (__v32hf) __B,
+                                    (__v32hf) __C,
+                                    (__mmask32) __U,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddph512_mask3 ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) __U,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddph512_maskz ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) __U,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+                              __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmaddph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_ph (__m512h __A, __m512h __B, __m512h __C,
+                               __mmask32 __U, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmaddph512_mask3 ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+                               __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmaddph512_maskz ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmadd_round_ph(A, B, C, R)                              \
+  ((__m512h)__builtin_ia32_vfmaddph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmadd_round_ph(A, U, B, C, R)                      \
+  ((__m512h)__builtin_ia32_vfmaddph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R)                     \
+  ((__m512h)__builtin_ia32_vfmaddph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R)                     \
+  ((__m512h)__builtin_ia32_vfmaddph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfnmadd[132,213,231]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) -1,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) __U,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmaddph512_mask3 ((__v32hf) __A,
+                                      (__v32hf) __B,
+                                      (__v32hf) __C,
+                                      (__mmask32) __U,
+                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmaddph512_maskz ((__v32hf) __A,
+                                      (__v32hf) __B,
+                                      (__v32hf) __C,
+                                      (__mmask32) __U,
+                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+                              __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmadd_round_ph (__m512h __A, __m512h __B, __m512h __C,
+                               __mmask32 __U, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmaddph512_mask3 ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+                               __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmaddph512_maskz ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fnmadd_round_ph(A, B, C, R)                             \
+  ((__m512h)__builtin_ia32_vfnmaddph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R)                     \
+  ((__m512h)__builtin_ia32_vfnmaddph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R)                    \
+  ((__m512h)__builtin_ia32_vfnmaddph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R)                    \
+  ((__m512h)__builtin_ia32_vfnmaddph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmsub[132,213,231]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+                                    (__v32hf) __B,
+                                    (__v32hf) __C,
+                                    (__mmask32) -1,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+                                    (__v32hf) __B,
+                                    (__v32hf) __C,
+                                    (__mmask32) __U,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubph512_mask3 ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) __U,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmsubph512_maskz ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) __U,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+                              __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmsubph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmsub_round_ph (__m512h __A, __m512h __B, __m512h __C,
+                               __mmask32 __U, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmsubph512_mask3 ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+                               __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfmsubph512_maskz ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fmsub_round_ph(A, B, C, R)                              \
+  ((__m512h)__builtin_ia32_vfmsubph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fmsub_round_ph(A, U, B, C, R)                      \
+  ((__m512h)__builtin_ia32_vfmsubph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R)                     \
+  ((__m512h)__builtin_ia32_vfmsubph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R)                     \
+  ((__m512h)__builtin_ia32_vfmsubph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfnmsub[132,213,231]ph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_ph (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) -1,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     (__v32hf) __C,
+                                     (__mmask32) __U,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmsubph512_mask3 ((__v32hf) __A,
+                                      (__v32hf) __B,
+                                      (__v32hf) __C,
+                                      (__mmask32) __U,
+                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfnmsubph512_maskz ((__v32hf) __A,
+                                      (__v32hf) __B,
+                                      (__v32hf) __C,
+                                      (__mmask32) __U,
+                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fnmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) -1, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fnmsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B,
+                              __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A,
+                                                      (__v32hf) __B,
+                                                      (__v32hf) __C,
+                                                      (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fnmsub_round_ph (__m512h __A, __m512h __B, __m512h __C,
+                               __mmask32 __U, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmsubph512_mask3 ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fnmsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B,
+                               __m512h __C, const int __R)
+{
+  return (__m512h) __builtin_ia32_vfnmsubph512_maskz ((__v32hf) __A,
+                                                       (__v32hf) __B,
+                                                       (__v32hf) __C,
+                                                       (__mmask32) __U, __R);
+}
+
+#else
+#define _mm512_fnmsub_round_ph(A, B, C, R)                             \
+  ((__m512h)__builtin_ia32_vfnmsubph512_mask ((A), (B), (C), -1, (R)))
+
+#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R)                     \
+  ((__m512h)__builtin_ia32_vfnmsubph512_mask ((A), (B), (C), (U), (R)))
+
+#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R)                    \
+  ((__m512h)__builtin_ia32_vfnmsubph512_mask3 ((A), (B), (C), (U), (R)))
+
+#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R)                    \
+  ((__m512h)__builtin_ia32_vfnmsubph512_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmadd[132,213,231]sh.  */
+extern __inline __m128h
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 (__v8hf) __B,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 (__v8hf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask3 ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 (__v8hf) __B,
+                                                 (__mmask8) -1,
+                                                 __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+                        const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 (__v8hf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask3 ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+                         __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fmadd_round_sh(A, B, C, R)                                 \
+  ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), (C), (-1), (R)))
+#define _mm_mask_fmadd_round_sh(A, U, B, C, R)                         \
+  ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), (C), (U), (R)))
+#define _mm_mask3_fmadd_round_sh(A, B, C, U, R)                                \
+  ((__m128h) __builtin_ia32_vfmaddsh3_mask3 ((A), (B), (C), (U), (R)))
+#define _mm_maskz_fmadd_round_sh(U, A, B, C, R)                                \
+  ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfnmadd[132,213,231]sh.  */
+extern __inline __m128h
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) -1,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 (__v8hf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_maskz ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) -1,
+                                                  __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+                        const int __R)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 (__v8hf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+                         __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfnmaddsh3_maskz ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fnmadd_round_sh(A, B, C, R)                                        \
+  ((__m128h) __builtin_ia32_vfnmaddsh3_mask ((A), (B), (C), (-1), (R)))
+#define _mm_mask_fnmadd_round_sh(A, U, B, C, R)                                \
+  ((__m128h) __builtin_ia32_vfnmaddsh3_mask ((A), (B), (C), (U), (R)))
+#define _mm_mask3_fnmadd_round_sh(A, B, C, U, R)                       \
+  ((__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((A), (B), (C), (U), (R)))
+#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R)                       \
+  ((__m128h) __builtin_ia32_vfnmaddsh3_maskz ((A), (B), (C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfmsub[132,213,231]sh.  */
+extern __inline __m128h
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  -(__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) -1,
+                                                 __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+                        const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 (__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+                         __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+                                                  (__v8hf) __A,
+                                                  -(__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fmsub_round_sh(A, B, C, R)                                 \
+  ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), -(C), (-1), (R)))
+#define _mm_mask_fmsub_round_sh(A, U, B, C, R)                         \
+  ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), -(C), (U), (R)))
+#define _mm_mask3_fmsub_round_sh(A, B, C, U, R)                                \
+  ((__m128h) __builtin_ia32_vfmsubsh3_mask3 ((A), (B), (C), (U), (R)))
+#define _mm_maskz_fmsub_round_sh(U, A, B, C, R)                                \
+  ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), (B), -(C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfnmsub[132,213,231]sh.  */
+extern __inline __m128h
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_sh (__m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 -(__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) -1,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 -(__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) __U,
+                                                 _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+                                                  -(__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+                                                  -(__v8hf) __A,
+                                                  -(__v8hf) __B,
+                                                  (__mmask8) __U,
+                                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 -(__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) -1,
+                                                 __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B,
+                        const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W,
+                                                 -(__v8hf) __A,
+                                                 -(__v8hf) __B,
+                                                 (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U,
+                         const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W,
+                                                  -(__v8hf) __A,
+                                                  (__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A,
+                         __m128h __B, const int __R)
+{
+  return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W,
+                                                  -(__v8hf) __A,
+                                                  -(__v8hf) __B,
+                                                  (__mmask8) __U, __R);
+}
+
+#else
+#define _mm_fnmsub_round_sh(A, B, C, R)                                        \
+  ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), -(B), -(C), (-1), (R)))
+#define _mm_mask_fnmsub_round_sh(A, U, B, C, R)                                \
+  ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), -(B), -(C), (U), (R)))
+#define _mm_mask3_fnmsub_round_sh(A, B, C, U, R)                       \
+  ((__m128h) __builtin_ia32_vfmsubsh3_mask3 ((A), -(B), (C), (U), (R)))
+#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R)                       \
+  ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), -(B), -(C), (U), (R)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]maddcph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+                                            (__v32hf) __C,
+                                            (__v32hf) __D, __B,
+                                            _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A,
+                                             (__v32hf) __B,
+                                             (__v32hf) __C,
+                                             __D, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B,
+                                             (__v32hf) __C,
+                                             (__v32hf) __D,
+                                             __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_pch (__m512h __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_round ((__v32hf) __A,
+                                      (__v32hf) __B,
+                                      (__v32hf) __C,
+                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+                                           (__v32hf) __C,
+                                           (__v32hf) __D, __B,
+                                           _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A,
+                                            (__v32hf) __B,
+                                            (__v32hf) __C,
+                                            __D, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B,
+                                            (__v32hf) __C,
+                                            (__v32hf) __D,
+                                            __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A,
+                                       (__v32hf) __B,
+                                       (__v32hf) __C,
+                                       __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+                             __m512h __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A,
+                                            (__v32hf) __C,
+                                            (__v32hf) __D, __B,
+                                            __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
+                              __mmask16 __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A,
+                                             (__v32hf) __B,
+                                             (__v32hf) __C,
+                                             __D, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
+                              __m512h __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B,
+                                             (__v32hf) __C,
+                                             (__v32hf) __D,
+                                             __A, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_round ((__v32hf) __A,
+                                      (__v32hf) __B,
+                                      (__v32hf) __C,
+                                      __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+                            __m512h __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A,
+                                           (__v32hf) __C,
+                                           (__v32hf) __D, __B,
+                                           __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C,
+                             __mmask16 __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A,
+                                            (__v32hf) __B,
+                                            (__v32hf) __C,
+                                            __D, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C,
+                             __m512h __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B,
+                                            (__v32hf) __C,
+                                            (__v32hf) __D,
+                                            __A, __E);
+}
+
+#else
+#define _mm512_fcmadd_round_pch(A, B, C, D)                    \
+  (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D))
+
+#define _mm512_mask_fcmadd_round_pch(A, B, C, D, E)                    \
+  ((__m512h)                                                           \
+    __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A),            \
+                                            (__v32hf) (C),             \
+                                            (__v32hf) (D),             \
+                                            (B), (E)))
+
+
+#define _mm512_mask3_fcmadd_round_pch(A, B, C, D, E)                   \
+  ((__m512h)                                                           \
+   __builtin_ia32_vfcmaddcph512_mask3_round ((A), (B), (C), (D), (E)))
+
+#define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E)                   \
+  (__m512h)                                                            \
+   __builtin_ia32_vfcmaddcph512_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm512_fmadd_round_pch(A, B, C, D)                     \
+  (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D))
+
+#define _mm512_mask_fmadd_round_pch(A, B, C, D, E)                     \
+  ((__m512h)                                                           \
+    __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A),             \
+                                           (__v32hf) (C),              \
+                                           (__v32hf) (D),              \
+                                           (B), (E)))
+
+#define _mm512_mask3_fmadd_round_pch(A, B, C, D, E)                    \
+  (__m512h)                                                            \
+   __builtin_ia32_vfmaddcph512_mask3_round ((A), (B), (C), (D), (E))
+
+#define _mm512_maskz_fmadd_round_pch(A, B, C, D, E)                    \
+  (__m512h)                                                            \
+   __builtin_ia32_vfmaddcph512_maskz_round ((B), (C), (D), (A), (E))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]mulcph.  */
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmul_pch (__m512h __A, __m512h __B)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmulcph512_round ((__v32hf) __A,
+                                      (__v32hf) __B,
+                                      _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C,
+                                           (__v32hf) __D,
+                                           (__v32hf) __A,
+                                           __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmul_pch (__mmask16 __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B,
+                                           (__v32hf) __C,
+                                           _mm512_setzero_ph (),
+                                           __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmul_pch (__m512h __A, __m512h __B)
+{
+  return (__m512h)
+    __builtin_ia32_vfmulcph512_round ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C,
+                                          (__v32hf) __D,
+                                          (__v32hf) __A,
+                                          __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmul_pch (__mmask16 __A, __m512h __B, __m512h __C)
+{
+  return (__m512h)
+    __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B,
+                                          (__v32hf) __C,
+                                          _mm512_setzero_ph (),
+                                          __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fcmul_round_pch (__m512h __A, __m512h __B, const int __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmulcph512_round ((__v32hf) __A,
+                                      (__v32hf) __B, __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fcmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+                            __m512h __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C,
+                                           (__v32hf) __D,
+                                           (__v32hf) __A,
+                                           __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fcmul_round_pch (__mmask16 __A, __m512h __B,
+                             __m512h __C, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B,
+                                           (__v32hf) __C,
+                                           _mm512_setzero_ph (),
+                                           __A, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fmul_round_pch (__m512h __A, __m512h __B, const int __D)
+{
+  return (__m512h)
+    __builtin_ia32_vfmulcph512_round ((__v32hf) __A,
+                                     (__v32hf) __B,
+                                     __D);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C,
+                           __m512h __D, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C,
+                                          (__v32hf) __D,
+                                          (__v32hf) __A,
+                                          __B, __E);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_fmul_round_pch (__mmask16 __A, __m512h __B,
+                            __m512h __C, const int __E)
+{
+  return (__m512h)
+    __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B,
+                                          (__v32hf) __C,
+                                          _mm512_setzero_ph (),
+                                          __A, __E);
+}
+
+#else
+#define _mm512_fcmul_round_pch(A, B, D)                                \
+  (__m512h) __builtin_ia32_vfcmulcph512_round ((A), (B), (D))
+
+#define _mm512_mask_fcmul_round_pch(A, B, C, D, E)                     \
+  (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((C), (D), (A), (B), (E))
+
+#define _mm512_maskz_fcmul_round_pch(A, B, C, E)                       \
+  (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((B), (C),          \
+                                                   (__v32hf)           \
+                                                   _mm512_setzero_ph (), \
+                                                   (A), (E))
+
+#define _mm512_fmul_round_pch(A, B, D)                 \
+  (__m512h) __builtin_ia32_vfmulcph512_round ((A), (B), (D))
+
+#define _mm512_mask_fmul_round_pch(A, B, C, D, E)                        \
+  (__m512h) __builtin_ia32_vfmulcph512_mask_round ((C), (D), (A), (B), (E))
+
+#define _mm512_maskz_fmul_round_pch(A, B, C, E)                                  \
+  (__m512h) __builtin_ia32_vfmulcph512_mask_round ((B), (C),             \
+                                                  (__v32hf)              \
+                                                  _mm512_setzero_ph (),  \
+                                                  (A), (E))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]maddcsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+                                         (__v8hf) __C,
+                                         (__v8hf) __D, __B,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A,
+                                          (__v8hf) __B,
+                                          (__v8hf) __C, __D,
+                                          _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B,
+                                          (__v8hf) __C,
+                                          (__v8hf) __D,
+                                          __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A,
+                                    (__v8hf) __B,
+                                    (__v8hf) __C,
+                                    _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+                                        (__v8hf) __C,
+                                        (__v8hf) __D, __B,
+                                        _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A,
+                                         (__v8hf) __B,
+                                         (__v8hf) __C, __D,
+                                         _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B,
+                                         (__v8hf) __C,
+                                         (__v8hf) __D,
+                                         __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sch (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_round ((__v8hf) __A,
+                                   (__v8hf) __B,
+                                   (__v8hf) __C,
+                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+                          __m128h __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A,
+                                         (__v8hf) __C,
+                                         (__v8hf) __D,
+                                         __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
+                           __mmask8 __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A,
+                                          (__v8hf) __B,
+                                          (__v8hf) __C,
+                                          __D, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+                           __m128h __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B,
+                                          (__v8hf) __C,
+                                          (__v8hf) __D,
+                                          __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A,
+                                    (__v8hf) __B,
+                                    (__v8hf) __C,
+                                    __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+                         __m128h __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A,
+                                        (__v8hf) __C,
+                                        (__v8hf) __D,
+                                        __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C,
+                          __mmask8 __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A,
+                                         (__v8hf) __B,
+                                         (__v8hf) __C,
+                                         __D, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+                          __m128h __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B,
+                                         (__v8hf) __C,
+                                         (__v8hf) __D,
+                                         __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcsh_round ((__v8hf) __A,
+                                   (__v8hf) __B,
+                                   (__v8hf) __C,
+                                   __D);
+}
+#else
+#define _mm_mask_fcmadd_round_sch(A, B, C, D, E)                       \
+    ((__m128h)                                                         \
+     __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A),               \
+                                          (__v8hf) (C),                \
+                                          (__v8hf) (D),                \
+                                          (B), (E)))
+
+
+#define _mm_mask3_fcmadd_round_sch(A, B, C, D, E)                      \
+  ((__m128h)                                                           \
+   __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) (A),                \
+                                         (__v8hf) (B),         \
+                                         (__v8hf) (C),         \
+                                         (D), (E)))
+
+#define _mm_maskz_fcmadd_round_sch(A, B, C, D, E)              \
+  __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm_fcmadd_round_sch(A, B, C, D)               \
+  __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D))
+
+#define _mm_mask_fmadd_round_sch(A, B, C, D, E)                                \
+    ((__m128h)                                                         \
+     __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A),                \
+                                         (__v8hf) (C),         \
+                                         (__v8hf) (D),         \
+                                         (B), (E)))
+
+#define _mm_mask3_fmadd_round_sch(A, B, C, D, E)                       \
+  ((__m128h)                                                           \
+   __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) (A),         \
+                                        (__v8hf) (B),          \
+                                        (__v8hf) (C),          \
+                                        (D), (E)))
+
+#define _mm_maskz_fmadd_round_sch(A, B, C, D, E)               \
+  __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E))
+
+#define _mm_fmadd_round_sch(A, B, C, D)                \
+  __builtin_ia32_vfmaddcsh_round ((A), (B), (C), (D))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vf[,c]mulcsh.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_sch (__m128h __A, __m128h __B)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmulcsh_round ((__v8hf) __A,
+                                   (__v8hf) __B,
+                                   _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C,
+                                        (__v8hf) __D,
+                                        (__v8hf) __A,
+                                        __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_sch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B,
+                                        (__v8hf) __C,
+                                        _mm_setzero_ph (),
+                                        __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_sch (__m128h __A, __m128h __B)
+{
+  return (__m128h)
+    __builtin_ia32_vfmulcsh_round ((__v8hf) __A,
+                                  (__v8hf) __B,
+                                  _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C,
+                                       (__v8hf) __D,
+                                       (__v8hf) __A,
+                                       __B, _MM_FROUND_CUR_DIRECTION);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_sch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return (__m128h)
+    __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B,
+                                       (__v8hf) __C,
+                                       _mm_setzero_ph (),
+                                       __A, _MM_FROUND_CUR_DIRECTION);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_round_sch (__m128h __A, __m128h __B, const int __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmulcsh_round ((__v8hf) __A,
+                                   (__v8hf) __B,
+                                   __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+                         __m128h __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C,
+                                        (__v8hf) __D,
+                                        (__v8hf) __A,
+                                        __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C,
+                          const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B,
+                                        (__v8hf) __C,
+                                        _mm_setzero_ph (),
+                                        __A, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_round_sch (__m128h __A, __m128h __B, const int __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmulcsh_round ((__v8hf) __A,
+                                  (__v8hf) __B, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C,
+                        __m128h __D, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C,
+                                       (__v8hf) __D,
+                                       (__v8hf) __A,
+                                       __B, __E);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, const int __E)
+{
+  return (__m128h)
+    __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B,
+                                       (__v8hf) __C,
+                                       _mm_setzero_ph (),
+                                       __A, __E);
+}
+
+#else
+#define _mm_fcmul_round_sch(__A, __B, __D)                             \
+  (__m128h) __builtin_ia32_vfcmulcsh_round ((__v8hf) __A,              \
+                                           (__v8hf) __B, __D)
+
+#define _mm_mask_fcmul_round_sch(__A, __B, __C, __D, __E)              \
+  (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C,         \
+                                                (__v8hf) __D,          \
+                                                (__v8hf) __A,          \
+                                                __B, __E)
+
+#define _mm_maskz_fcmul_round_sch(__A, __B, __C, __E)                  \
+  (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B,         \
+                                                (__v8hf) __C,          \
+                                                _mm_setzero_ph (),     \
+                                                __A, __E)
+
+#define _mm_fmul_round_sch(__A, __B, __D)                              \
+  (__m128h) __builtin_ia32_vfmulcsh_round ((__v8hf) __A,               \
+                                          (__v8hf) __B, __D)
+
+#define _mm_mask_fmul_round_sch(__A, __B, __C, __D, __E)               \
+  (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C,          \
+                                               (__v8hf) __D,           \
+                                               (__v8hf) __A,           \
+                                               __B, __E)
+
+#define _mm_maskz_fmul_round_sch(__A, __B, __C, __E)                   \
+  (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B,          \
+                                               (__v8hf) __C,           \
+                                               _mm_setzero_ph (),      \
+                                               __A, __E)
+
+#endif /* __OPTIMIZE__ */
+
+#define _MM512_REDUCE_OP(op)                                           \
+  __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0);  \
+  __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1);  \
+  __m256h __T3 = (__T1 op __T2);                                       \
+  __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0);  \
+  __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1);  \
+  __m128h __T6 = (__T4 op __T5);                                       \
+  __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6,           \
+                (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 });                  \
+  __m128h __T8 = (__T6 op __T7);                                       \
+  __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8,           \
+                (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 });                  \
+  __m128h __T10 = __T8 op __T9;                                        \
+  return __T10[0] op __T10[1]
+
+/* Intrinsics for full-width horizontal reductions: _mm512_reduce_[add,mul,min,max]_ph.  */
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_add_ph (__m512h __A)
+{
+   _MM512_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_mul_ph (__m512h __A)
+{
+   _MM512_REDUCE_OP (*);
+}
+
+#undef _MM512_REDUCE_OP
+
+#ifdef __AVX512VL__
+
+#define _MM512_REDUCE_OP(op)                                           \
+  __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0);  \
+  __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1);  \
+  __m256h __T3 = __builtin_ia32_##op##ph256_mask (__T1, __T2,          \
+                _mm256_setzero_ph (), (__mmask16) -1);         \
+  __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0);  \
+  __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1);  \
+  __m128h __T6 = __builtin_ia32_##op##ph128_mask                       \
+                (__T4, __T5, _mm_setzero_ph (),(__mmask8) -1); \
+  __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6,           \
+                (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 });                  \
+  __m128h __T8 = (__m128h)  __builtin_ia32_##op##ph128_mask            \
+                (__T6, __T7, _mm_setzero_ph (),(__mmask8) -1); \
+  __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8,           \
+                (__v8hi) { 4, 5 });                                    \
+  __m128h __T10 = __builtin_ia32_##op##ph128_mask                      \
+                 (__T8, __T9, _mm_setzero_ph (),(__mmask8) -1);        \
+  __m128h __T11 = (__m128h) __builtin_shuffle (__T10,                  \
+                 (__v8hi) { 1, 0 });                                   \
+  __m128h __T12 = __builtin_ia32_##op##ph128_mask                      \
+                 (__T10, __T11, _mm_setzero_ph (),(__mmask8) -1);      \
+  return __T12[0]
+
+#else
+
+#define _MM512_REDUCE_OP(op)                                           \
+  __m512h __T1 = (__m512h) __builtin_shuffle ((__m512d) __A,           \
+                (__v8di) { 4, 5, 6, 7, 0, 0, 0, 0 });                  \
+  __m512h __T2 = _mm512_##op##_ph (__A, __T1);                         \
+  __m512h __T3 = (__m512h) __builtin_shuffle ((__m512d) __T2,          \
+                (__v8di) { 2, 3, 0, 0, 0, 0, 0, 0 });                  \
+  __m512h __T4 = _mm512_##op##_ph (__T2, __T3);                        \
+  __m512h __T5 = (__m512h) __builtin_shuffle ((__m512d) __T4,          \
+                (__v8di) { 1, 0, 0, 0, 0, 0, 0, 0 });                  \
+  __m512h __T6 = _mm512_##op##_ph (__T4, __T5);                        \
+  __m512h __T7 = (__m512h) __builtin_shuffle ((__m512) __T6,           \
+                (__v16si) { 1, 0, 0, 0, 0, 0, 0, 0,                    \
+                            0, 0, 0, 0, 0, 0, 0, 0 });         \
+  __m512h __T8 = _mm512_##op##_ph (__T6, __T7);                        \
+  __m512h __T9 = (__m512h) __builtin_shuffle (__T8,                    \
+                (__v32hi) { 1, 0, 0, 0, 0, 0, 0, 0,                    \
+                            0, 0, 0, 0, 0, 0, 0, 0,                    \
+                            0, 0, 0, 0, 0, 0, 0, 0,                    \
+                            0, 0, 0, 0, 0, 0, 0, 0 });         \
+  __m512h __T10 = _mm512_##op##_ph (__T8, __T9);                       \
+  return __T10[0]
+#endif
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_min_ph (__m512h __A)
+{
+  _MM512_REDUCE_OP (min);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_reduce_max_ph (__m512h __A)
+{
+  _MM512_REDUCE_OP (max);
+}
+
+#undef _MM512_REDUCE_OP
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_blend_ph (__mmask32 __U, __m512h __A, __m512h __W)
+{
+  return (__m512h) __builtin_ia32_movdquhi512_mask ((__v32hi) __W,
+                                                   (__v32hi) __A,
+                                                   (__mmask32) __U);
+
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_ph (__m512h __A, __m512i __I, __m512h __B)
+{
+  return (__m512h) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A,
+                                                      (__v32hi) __I,
+                                                      (__v32hi) __B,
+                                                      (__mmask32)-1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_ph (__m512i __A, __m512h __B)
+{
+  return (__m512h) __builtin_ia32_permvarhi512_mask ((__v32hi) __B,
+                                                    (__v32hi) __A,
+                                                    (__v32hi)
+                                                    (_mm512_setzero_ph ()),
+                                                    (__mmask32)-1);
+}
+
+extern __inline __m512h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_set1_pch (_Float16 _Complex __A)
+{
+  union
+  {
+    _Float16 _Complex __a;
+    float __b;
+  } __u = { .__a = __A};
+
+  return (__m512h) _mm512_set1_ps (__u.__b);
+}
+
+/* The intrinsics below are aliases for the f[,c]mul_[p,s]ch intrinsics above.  */
+#define _mm512_mul_pch(A, B) _mm512_fmul_pch ((A), (B))
+#define _mm512_mask_mul_pch(W, U, A, B)                                      \
+  _mm512_mask_fmul_pch ((W), (U), (A), (B))
+#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch ((U), (A), (B))
+#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch ((A), (B), (R))
+#define _mm512_mask_mul_round_pch(W, U, A, B, R)                     \
+  _mm512_mask_fmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_mul_round_pch(U, A, B, R)                       \
+  _mm512_maskz_fmul_round_pch ((U), (A), (B), (R))
+
+#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch ((A), (B))
+#define _mm512_mask_cmul_pch(W, U, A, B)                             \
+  _mm512_mask_fcmul_pch ((W), (U), (A), (B))
+#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch ((U), (A), (B))
+#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch ((A), (B), (R))
+#define _mm512_mask_cmul_round_pch(W, U, A, B, R)                    \
+  _mm512_mask_fcmul_round_pch ((W), (U), (A), (B), (R))
+#define _mm512_maskz_cmul_round_pch(U, A, B, R)                              \
+  _mm512_maskz_fcmul_round_pch ((U), (A), (B), (R))
+
+#define _mm_mul_sch(A, B) _mm_fmul_sch ((A), (B))
+#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch ((W), (U), (A), (B))
+#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch ((U), (A), (B))
+#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch ((A), (B), (R))
+#define _mm_mask_mul_round_sch(W, U, A, B, R)                        \
+  _mm_mask_fmul_round_sch ((W), (U), (A), (B), (R))
+#define _mm_maskz_mul_round_sch(U, A, B, R)                          \
+  _mm_maskz_fmul_round_sch ((U), (A), (B), (R))
+
+#define _mm_cmul_sch(A, B) _mm_fcmul_sch ((A), (B))
+#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch ((W), (U), (A), (B))
+#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch ((U), (A), (B))
+#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch ((A), (B), (R))
+#define _mm_mask_cmul_round_sch(W, U, A, B, R)                       \
+  _mm_mask_fcmul_round_sch ((W), (U), (A), (B), (R))
+#define _mm_maskz_cmul_round_sch(U, A, B, R)                         \
+  _mm_maskz_fcmul_round_sch ((U), (A), (B), (R))
+
+#ifdef __DISABLE_AVX512FP16__
+#undef __DISABLE_AVX512FP16__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512FP16__ */
+
+#endif /* __AVX512FP16INTRIN_H_INCLUDED */
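
A minimal usage sketch of the packed complex-FP16 intrinsics defined above, assuming
<immintrin.h> is included and the translation unit is built with -mavx512fp16
(illustrative only, not part of the imported header):

    #include <immintrin.h>

    /* acc += a * x, with each pair of _Float16 lanes treated as one complex
       number (vfmaddcph), then sum all 32 half-precision lanes.  */
    _Float16
    cfma_then_sum (__m512h a, __m512h x, __m512h acc)
    {
      __m512h r = _mm512_fmadd_pch (a, x, acc);
      return _mm512_reduce_add_ph (r);
    }
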
diff --git a/include-gcc/avx512fp16vlintrin.h b/include-gcc/avx512fp16vlintrin.h
new file mode 100644 (file)
index 0000000..308b0b2
--- /dev/null
@@ -0,0 +1,3362 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512FP16VLINTRIN_H_INCLUDED
+#define __AVX512FP16VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512FP16__)
+#pragma GCC push_options
+#pragma GCC target("avx512fp16,avx512vl")
+#define __DISABLE_AVX512FP16VL__
+#endif /* !__AVX512VL__ || !__AVX512FP16__ */
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_ps (__m128h __a)
+{
+  return (__m128) __a;
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_ps (__m256h __a)
+{
+  return (__m256) __a;
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_pd (__m128h __a)
+{
+  return (__m128d) __a;
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_pd (__m256h __a)
+{
+  return (__m256d) __a;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castph_si128 (__m128h __a)
+{
+  return (__m128i) __a;
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph_si256 (__m256h __a)
+{
+  return (__m256i) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_ph (__m128 __a)
+{
+  return (__m128h) __a;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_ph (__m256 __a)
+{
+  return (__m256h) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ph (__m128d __a)
+{
+  return (__m128h) __a;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_ph (__m256d __a)
+{
+  return (__m256h) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ph (__m128i __a)
+{
+  return (__m128h) __a;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_ph (__m256i __a)
+{
+  return (__m256h) __a;
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph256_ph128 (__m256h __A)
+{
+  union
+  {
+    __m128h __a[2];
+    __m256h __v;
+  } __u = { .__v = __A };
+  return __u.__a[0];
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castph128_ph256 (__m128h __A)
+{
+  union
+  {
+    __m128h __a[2];
+    __m256h __v;
+  } __u;
+  __u.__a[0] = __A;
+  return __u.__v;
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextph128_ph256 (__m128h __A)
+{
+  return (__m256h) _mm256_insertf128_ps (_mm256_setzero_ps (),
+                                        (__m128) __A, 0);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conj_pch (__m256h __A)
+{
+  return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31));
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conj_pch (__m256h __W, __mmask8 __U, __m256h __A)
+{
+  return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf)
+                                                  _mm256_conj_pch (__A),
+                                                 (__v8sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conj_pch (__mmask8 __U, __m256h __A)
+{
+  return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf)
+                                                  _mm256_conj_pch (__A),
+                                                 (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conj_pch (__m128h __A)
+{
+  return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31));
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conj_pch (__m128h __W, __mmask8 __U, __m128h __A)
+{
+  return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A),
+                                                 (__v4sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conj_pch (__mmask8 __U, __m128h __A)
+{
+  return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A),
+                                                 (__v4sf) _mm_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+/* Intrinsics v[add,sub,mul,div]ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A + (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A + (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_addph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_addph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_addph128_mask (__B, __C, _mm_setzero_ph (),
+                                      __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_addph256_mask (__B, __C,
+                                      _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A - (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A - (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_subph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_subph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_subph128_mask (__B, __C, _mm_setzero_ph (),
+                                      __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_subph256_mask (__B, __C,
+                                      _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A * (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A * (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_mulph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_mulph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_mulph128_mask (__B, __C, _mm_setzero_ph (),
+                                      __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_mulph256_mask (__B, __C,
+                                      _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ph (__m128h __A, __m128h __B)
+{
+  return (__m128h) ((__v8hf) __A / (__v8hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_ph (__m256h __A, __m256h __B)
+{
+  return (__m256h) ((__v16hf) __A / (__v16hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_divph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_divph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_divph128_mask (__B, __C, _mm_setzero_ph (),
+                                      __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_divph256_mask (__B, __C,
+                                      _mm256_setzero_ph (), __A);
+}
+
+/* Intrinsics v[max,min]ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ph (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_maxph128_mask (__A, __B,
+                                      _mm_setzero_ph (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_ph (__m256h __A, __m256h __B)
+{
+  return __builtin_ia32_maxph256_mask (__A, __B,
+                                      _mm256_setzero_ph (),
+                                      (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_maxph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_maxph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_maxph128_mask (__B, __C, _mm_setzero_ph (),
+                                      __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_maxph256_mask (__B, __C,
+                                      _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ph (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_minph128_mask (__A, __B,
+                                      _mm_setzero_ph (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_ph (__m256h __A, __m256h __B)
+{
+  return __builtin_ia32_minph256_mask (__A, __B,
+                                      _mm256_setzero_ph (),
+                                      (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_minph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D)
+{
+  return __builtin_ia32_minph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_minph128_mask (__B, __C, _mm_setzero_ph (),
+                                      __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_minph256_mask (__B, __C,
+                                      _mm256_setzero_ph (), __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_ph (__m128h __A)
+{
+  return (__m128h) _mm_and_si128 ( _mm_set1_epi32 (0x7FFF7FFF),
+                                  (__m128i) __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_ph (__m256h __A)
+{
+  return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32 (0x7FFF7FFF),
+                                     (__m256i) __A);
+}
+
+/* Intrinsics vcmpph.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ph_mask (__m128h __A, __m128h __B, const int __C)
+{
+  return (__mmask8) __builtin_ia32_cmpph128_mask (__A, __B, __C,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ph_mask (__mmask8 __A, __m128h __B, __m128h __C,
+                     const int __D)
+{
+  return (__mmask8) __builtin_ia32_cmpph128_mask (__B, __C, __D, __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ph_mask (__m256h __A, __m256h __B, const int __C)
+{
+  return (__mmask16) __builtin_ia32_cmpph256_mask (__A, __B, __C,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C,
+                     const int __D)
+{
+  return (__mmask16) __builtin_ia32_cmpph256_mask (__B, __C, __D,
+                                                  __A);
+}
+
+#else
+#define _mm_cmp_ph_mask(A, B, C)                       \
+  (__builtin_ia32_cmpph128_mask ((A), (B), (C), (-1)))
+
+#define _mm_mask_cmp_ph_mask(A, B, C, D)               \
+  (__builtin_ia32_cmpph128_mask ((B), (C), (D), (A)))
+
+#define _mm256_cmp_ph_mask(A, B, C)                    \
+  (__builtin_ia32_cmpph256_mask ((A), (B), (C), (-1)))
+
+#define _mm256_mask_cmp_ph_mask(A, B, C, D)            \
+  (__builtin_ia32_cmpph256_mask ((B), (C), (D), (A)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vsqrtph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ph (__m128h __A)
+{
+  return __builtin_ia32_sqrtph128_mask (__A, _mm_setzero_ph (),
+                                       (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_ph (__m256h __A)
+{
+  return __builtin_ia32_sqrtph256_mask (__A, _mm256_setzero_ph (),
+                                       (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_sqrtph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+  return __builtin_ia32_sqrtph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_ph (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_sqrtph128_mask (__B, _mm_setzero_ph (),
+                                       __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_ph (__mmask16 __A, __m256h __B)
+{
+  return __builtin_ia32_sqrtph256_mask (__B, _mm256_setzero_ph (),
+                                       __A);
+}
+
+/* Intrinsics vrsqrtph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ph (__m128h __A)
+{
+  return __builtin_ia32_rsqrtph128_mask (__A, _mm_setzero_ph (),
+                                        (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt_ph (__m256h __A)
+{
+  return __builtin_ia32_rsqrtph256_mask (__A, _mm256_setzero_ph (),
+                                        (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_rsqrtph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+  return __builtin_ia32_rsqrtph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt_ph (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_rsqrtph128_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt_ph (__mmask16 __A, __m256h __B)
+{
+  return __builtin_ia32_rsqrtph256_mask (__B, _mm256_setzero_ph (),
+                                        __A);
+}
+
+/* Intrinsics vrcpph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ph (__m128h __A)
+{
+  return __builtin_ia32_rcpph128_mask (__A, _mm_setzero_ph (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp_ph (__m256h __A)
+{
+  return __builtin_ia32_rcpph256_mask (__A, _mm256_setzero_ph (),
+                                      (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp_ph (__m128h __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_rcpph128_mask (__C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rcp_ph (__m256h __A, __mmask16 __B, __m256h __C)
+{
+  return __builtin_ia32_rcpph256_mask (__C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp_ph (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_rcpph128_mask (__B, _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rcp_ph (__mmask16 __A, __m256h __B)
+{
+  return __builtin_ia32_rcpph256_mask (__B, _mm256_setzero_ph (),
+                                      __A);
+}
+
+/* Intrinsics vscalefph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ph (__m128h __A, __m128h __B)
+{
+  return __builtin_ia32_scalefph128_mask (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_ph (__m256h __A, __m256h __B)
+{
+  return __builtin_ia32_scalefph256_mask (__A, __B,
+                                         _mm256_setzero_ph (),
+                                         (__mmask16) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return __builtin_ia32_scalefph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_ph (__m256h __A, __mmask16 __B, __m256h __C,
+                      __m256h __D)
+{
+  return __builtin_ia32_scalefph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_ph (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return __builtin_ia32_scalefph128_mask (__B, __C,
+                                         _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_ph (__mmask16 __A, __m256h __B, __m256h __C)
+{
+  return __builtin_ia32_scalefph256_mask (__B, __C,
+                                         _mm256_setzero_ph (),
+                                         __A);
+}
+
+/* Intrinsics vreduceph.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ph (__m128h __A, int __B)
+{
+  return __builtin_ia32_reduceph128_mask (__A, __B,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_reduceph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ph (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_reduceph128_mask (__B, __C,
+                                         _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_ph (__m256h __A, int __B)
+{
+  return __builtin_ia32_reduceph256_mask (__A, __B,
+                                         _mm256_setzero_ph (),
+                                         (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_ph (__m256h __A, __mmask16 __B, __m256h __C, int __D)
+{
+  return __builtin_ia32_reduceph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_ph (__mmask16 __A, __m256h __B, int __C)
+{
+  return __builtin_ia32_reduceph256_mask (__B, __C,
+                                         _mm256_setzero_ph (),
+                                         __A);
+}
+
+#else
+#define _mm_reduce_ph(A, B)                            \
+  (__builtin_ia32_reduceph128_mask ((A), (B),          \
+                                   _mm_setzero_ph (),  \
+                                   ((__mmask8)-1)))
+
+#define _mm_mask_reduce_ph(A,  B,  C, D)                       \
+  (__builtin_ia32_reduceph128_mask ((C), (D), (A), (B)))
+
+#define _mm_maskz_reduce_ph(A,  B, C)                                  \
+  (__builtin_ia32_reduceph128_mask ((B), (C), _mm_setzero_ph (), (A)))
+
+#define _mm256_reduce_ph(A, B)                                 \
+  (__builtin_ia32_reduceph256_mask ((A), (B),                  \
+                                   _mm256_setzero_ph (),       \
+                                   ((__mmask16)-1)))
+
+#define _mm256_mask_reduce_ph(A, B, C, D)                      \
+  (__builtin_ia32_reduceph256_mask ((C), (D), (A), (B)))
+
+#define _mm256_maskz_reduce_ph(A, B, C)                                        \
+  (__builtin_ia32_reduceph256_mask ((B), (C), _mm256_setzero_ph (), (A)))
+
+#endif /* __OPTIMIZE__ */
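+
+/* Editor's note (not part of the upstream header): vreduceph keeps the
+   remainder left after rounding each lane to the number of fraction
+   bits encoded in the immediate, e.g.
+
+     __m128h frac = _mm_reduce_ph (x, 0);   // x minus round-to-nearest-int (x)
+
+   The immediate must be a compile-time constant; the inline functions
+   above rely on constant propagation, which is only guaranteed under
+   optimization, hence the equivalent macro forms when __OPTIMIZE__ is
+   not defined.  The same pattern recurs for the other immediate-taking
+   intrinsics below.  */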
+
+/* Intrinsics vrndscaleph.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_ph (__m128h __A, int __B)
+{
+  return __builtin_ia32_rndscaleph128_mask (__A, __B,
+                                           _mm_setzero_ph (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D)
+{
+  return __builtin_ia32_rndscaleph128_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_ph (__mmask8 __A, __m128h __B, int __C)
+{
+  return __builtin_ia32_rndscaleph128_mask (__B, __C,
+                                           _mm_setzero_ph (), __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_roundscale_ph (__m256h __A, int __B)
+{
+  return __builtin_ia32_rndscaleph256_mask (__A, __B,
+                                           _mm256_setzero_ph (),
+                                           (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_roundscale_ph (__m256h __A, __mmask16 __B, __m256h __C,
+                          int __D)
+{
+  return __builtin_ia32_rndscaleph256_mask (__C, __D, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_roundscale_ph (__mmask16 __A, __m256h __B, int __C)
+{
+  return __builtin_ia32_rndscaleph256_mask (__B, __C,
+                                           _mm256_setzero_ph (),
+                                           __A);
+}
+
+#else
+#define _mm_roundscale_ph(A, B)                                                \
+  (__builtin_ia32_rndscaleph128_mask ((A), (B), _mm_setzero_ph (),     \
+                                     ((__mmask8)-1)))
+
+#define _mm_mask_roundscale_ph(A, B, C, D)                     \
+  (__builtin_ia32_rndscaleph128_mask ((C), (D), (A), (B)))
+
+#define _mm_maskz_roundscale_ph(A, B, C)                               \
+  (__builtin_ia32_rndscaleph128_mask ((B), (C), _mm_setzero_ph (), (A)))
+
+#define _mm256_roundscale_ph(A, B)                             \
+  (__builtin_ia32_rndscaleph256_mask ((A), (B),                        \
+                                     _mm256_setzero_ph(),      \
+                                     ((__mmask16)-1)))
+
+#define _mm256_mask_roundscale_ph(A, B, C, D)                  \
+  (__builtin_ia32_rndscaleph256_mask ((C), (D), (A), (B)))
+
+#define _mm256_maskz_roundscale_ph(A, B, C)                            \
+  (__builtin_ia32_rndscaleph256_mask ((B), (C),                                \
+                                     _mm256_setzero_ph (), (A)))
+
+#endif /* __OPTIMIZE__ */
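+
+/* Editor's note (illustrative sketch, not part of the upstream header):
+   vrndscaleph rounds each lane to a selectable number of fraction bits;
+   the imm8 is interpreted as for the existing VRNDSCALEPS/PD intrinsics
+   (high nibble = fraction bits kept, low bits = rounding control), e.g.
+
+     __m128h f = _mm_roundscale_ph (x, 0x01);   // round each lane toward -inf
+*/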
+
+/* Intrinsics vfpclassph.  */
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_ph_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A,
+                                                     __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ph_mask (__m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_ph_mask (__mmask16 __U, __m256h __A, const int __imm)
+{
+  return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A,
+                                                      __imm, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_ph_mask (__m256h __A, const int __imm)
+{
+  return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A,
+                                                      __imm,
+                                                      (__mmask16) -1);
+}
+
+#else
+#define _mm_fpclass_ph_mask(X, C)                                       \
+  ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X),        \
+                                               (int) (C),(__mmask8)-1))
+
+#define _mm_mask_fpclass_ph_mask(u, X, C)                               \
+  ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X),        \
+                                               (int) (C),(__mmask8)(u)))
+
+#define _mm256_fpclass_ph_mask(X, C)                                    \
+  ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \
+                                                (int) (C),(__mmask16)-1))
+
+#define _mm256_mask_fpclass_ph_mask(u, X, C)                           \
+  ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \
+                                                (int) (C),(__mmask16)(u)))
+#endif /* __OPTIMIZE__ */
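+
+/* Editor's note (illustrative sketch, not part of the upstream header):
+   vfpclassph tests every lane against the categories selected by the
+   immediate (QNaN, SNaN, +/-0, +/-Inf, denormal, negative finite) and
+   returns one mask bit per lane, e.g.
+
+     __mmask8 nan_lanes = _mm_fpclass_ph_mask (x, 0x81);   // QNaN | SNaN
+     __mmask8 inf_lanes = _mm_fpclass_ph_mask (x, 0x18);   // +Inf | -Inf
+*/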
+
+/* Intrinsics vgetexpph, vgetexpsh.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_ph (__m256h __A)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+                                                   (__v16hf)
+                                                   _mm256_setzero_ph (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_ph (__m256h __W, __mmask16 __U, __m256h __A)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+                                                   (__v16hf) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_ph (__mmask16 __U, __m256h __A)
+{
+  return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A,
+                                                   (__v16hf)
+                                                   _mm256_setzero_ph (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ph (__m128h __A)
+{
+  return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+                                                   (__v8hf)
+                                                   _mm_setzero_ph (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_ph (__m128h __W, __mmask8 __U, __m128h __A)
+{
+  return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+                                                   (__v8hf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_ph (__mmask8 __U, __m128h __A)
+{
+  return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A,
+                                                   (__v8hf)
+                                                   _mm_setzero_ph (),
+                                                   (__mmask8) __U);
+}
+
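+/* Editor's note (illustrative sketch, not part of the upstream header):
+   vgetexpph extracts the unbiased exponent of each lane as a
+   half-precision value, roughly floor (log2 (|x|)), e.g.
+
+     __m128h e = _mm_getexp_ph (_mm_set1_ph (6.0));   // every lane becomes 2.0
+*/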
+
+/* Intrinsics vgetmantph, vgetmantsh.  */
+#ifdef __OPTIMIZE__
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_ph (__m256h __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v16hf)
+                                                    _mm256_setzero_ph (),
+                                                    (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_ph (__m256h __W, __mmask16 __U, __m256h __A,
+                       _MM_MANTISSA_NORM_ENUM __B,
+                       _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v16hf) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_ph (__mmask16 __U, __m256h __A,
+                        _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v16hf)
+                                                    _mm256_setzero_ph (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ph (__m128h __A, _MM_MANTISSA_NORM_ENUM __B,
+               _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v8hf)
+                                                    _mm_setzero_ph (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_ph (__m128h __W, __mmask8 __U, __m128h __A,
+                    _MM_MANTISSA_NORM_ENUM __B,
+                    _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v8hf) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_ph (__mmask8 __U, __m128h __A,
+                     _MM_MANTISSA_NORM_ENUM __B,
+                     _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v8hf)
+                                                    _mm_setzero_ph (),
+                                                    (__mmask8) __U);
+}
+
+#else
+#define _mm256_getmant_ph(X, B, C)                                     \
+  ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+                                              (int)(((C)<<2) | (B)),   \
+                                              (__v16hf)(__m256h)_mm256_setzero_ph (), \
+                                              (__mmask16)-1))
+
+#define _mm256_mask_getmant_ph(W, U, X, B, C)                          \
+  ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+                                              (int)(((C)<<2) | (B)),   \
+                                              (__v16hf)(__m256h)(W),   \
+                                              (__mmask16)(U)))
+
+#define _mm256_maskz_getmant_ph(U, X, B, C)                            \
+  ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \
+                                              (int)(((C)<<2) | (B)),   \
+                                              (__v16hf)(__m256h)_mm256_setzero_ph (), \
+                                              (__mmask16)(U)))
+
+#define _mm_getmant_ph(X, B, C)                                                \
+  ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X),  \
+                                              (int)(((C)<<2) | (B)),   \
+                                              (__v8hf)(__m128h)_mm_setzero_ph (), \
+                                              (__mmask8)-1))
+
+#define _mm_mask_getmant_ph(W, U, X, B, C)                             \
+  ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X),  \
+                                              (int)(((C)<<2) | (B)),   \
+                                              (__v8hf)(__m128h)(W),    \
+                                              (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ph(U, X, B, C)                               \
+  ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X),  \
+                                              (int)(((C)<<2) | (B)),   \
+                                              (__v8hf)(__m128h)_mm_setzero_ph (), \
+                                              (__mmask8)(U)))
+
+#endif /* __OPTIMIZE__ */
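+
+/* Editor's note (illustrative sketch, not part of the upstream header):
+   vgetmantph extracts each lane's mantissa, normalized into the interval
+   selected by the _MM_MANTISSA_NORM_ENUM argument, with the sign handled
+   per the _MM_MANTISSA_SIGN_ENUM argument, e.g. (enumerators from
+   avx512fintrin.h)
+
+     __m128h m = _mm_getmant_ph (x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
+*/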
+
+/* Intrinsics vcvtph2dq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi32 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2dq128_mask (__A,
+                                     (__v4si)
+                                     _mm_setzero_si128 (),
+                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2dq128_mask (__C, ( __v4si) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2dq128_mask (__B,
+                                     (__v4si) _mm_setzero_si128 (),
+                                     __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi32 (__m128h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2dq256_mask (__A,
+                                     (__v8si)
+                                     _mm256_setzero_si256 (),
+                                     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2dq256_mask (__C, ( __v8si) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2dq256_mask (__B,
+                                     (__v8si)
+                                     _mm256_setzero_si256 (),
+                                     __A);
+}
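+
+/* Editor's note (illustrative sketch, not part of the upstream header):
+   the non-truncating FP16->integer conversions round according to the
+   current MXCSR rounding mode (the vcvttph variants further below
+   truncate toward zero instead); masked variants again merge from, or
+   zero, the destination lanes, e.g.
+
+     __m128i i  = _mm_cvtph_epi32 (h);                  // low 4 halves -> 4 x int32
+     __m128i im = _mm_mask_cvtph_epi32 (old, 0x3, h);   // lanes 0-1 converted, 2-3 from old
+*/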
+
+/* Intrinsics vcvtph2udq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu32 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2udq128_mask (__A,
+                                      (__v4si)
+                                      _mm_setzero_si128 (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2udq128_mask (__C, ( __v4si) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2udq128_mask (__B,
+                                      (__v4si)
+                                      _mm_setzero_si128 (),
+                                      __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu32 (__m128h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2udq256_mask (__A,
+                                      (__v8si)
+                                      _mm256_setzero_si256 (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2udq256_mask (__C, ( __v8si) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2udq256_mask (__B,
+                                      (__v8si) _mm256_setzero_si256 (),
+                                      __A);
+}
+
+/* Intrinsics vcvttph2dq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi32 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2dq128_mask (__A,
+                                      (__v4si) _mm_setzero_si128 (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)__builtin_ia32_vcvttph2dq128_mask (__C,
+                                                    ( __v4si) __A,
+                                                    __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2dq128_mask (__B,
+                                      (__v4si) _mm_setzero_si128 (),
+                                      __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi32 (__m128h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2dq256_mask (__A,
+                                      (__v8si)
+                                      _mm256_setzero_si256 (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2dq256_mask (__C,
+                                      ( __v8si) __A,
+                                      __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2dq256_mask (__B,
+                                      (__v8si)
+                                      _mm256_setzero_si256 (),
+                                      __A);
+}
+
+/* Intrinsics vcvttph2udq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu32 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2udq128_mask (__A,
+                                       (__v4si)
+                                       _mm_setzero_si128 (),
+                                       (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu32 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2udq128_mask (__C,
+                                       ( __v4si) __A,
+                                       __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2udq128_mask (__B,
+                                       (__v4si)
+                                       _mm_setzero_si128 (),
+                                       __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu32 (__m128h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2udq256_mask (__A,
+                                       (__v8si)
+                                       _mm256_setzero_si256 (), (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu32 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2udq256_mask (__C,
+                                       ( __v8si) __A,
+                                       __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2udq256_mask (__B,
+                                       (__v8si)
+                                       _mm256_setzero_si256 (),
+                                       __A);
+}
+
+/* Intrinsics vcvtdq2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_ph (__m128i __A)
+{
+  return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_ph (__mmask8 __A, __m128i __B)
+{
+  return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __B,
+                                          _mm_setzero_ph (),
+                                          __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_ph (__m256i __A)
+{
+  return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+  return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_ph (__mmask8 __A, __m256i __B)
+{
+  return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __B,
+                                          _mm_setzero_ph (),
+                                          __A);
+}
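+
+/* Editor's note (not part of the upstream header): the int32->FP16
+   conversions narrow, so both widths return __m128h: the 256-bit source
+   fills all 8 half lanes, while the 128-bit source fills only the low 4
+   and zeroes the rest, e.g.
+
+     __m128h a = _mm_cvtepi32_ph (v128);      // 4 halves, upper 4 lanes zero
+     __m128h b = _mm256_cvtepi32_ph (v256);   // 8 halves
+*/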
+
+/* Intrinsics vcvtudq2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_ph (__m128i __A)
+{
+  return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __A,
+                                           _mm_setzero_ph (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __C,
+                                           __A,
+                                           __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_ph (__mmask8 __A, __m128i __B)
+{
+  return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __B,
+                                           _mm_setzero_ph (),
+                                           __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_ph (__m256i __A)
+{
+  return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __A,
+                                           _mm_setzero_ph (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+  return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_ph (__mmask8 __A, __m256i __B)
+{
+  return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __B,
+                                           _mm_setzero_ph (),
+                                           __A);
+}
+
+/* Intrinsics vcvtph2qq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi64 (__m128h __A)
+{
+  return
+    __builtin_ia32_vcvtph2qq128_mask (__A,
+                                     _mm_setzero_si128 (),
+                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2qq128_mask (__C, __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2qq128_mask (__B,
+                                          _mm_setzero_si128 (),
+                                          __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi64 (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2qq256_mask (__A,
+                                          _mm256_setzero_si256 (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2qq256_mask (__C, __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2qq256_mask (__B,
+                                          _mm256_setzero_si256 (),
+                                          __A);
+}
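+
+/* Editor's note (not part of the upstream header): the FP16->64-bit
+   integer conversions widen, so only the lowest halves of the __m128h
+   source participate: 2 lanes for the 128-bit result, 4 lanes for the
+   256-bit result, e.g.
+
+     __m128i q2 = _mm_cvtph_epi64 (h);      // halves 0-1 -> 2 x int64
+     __m256i q4 = _mm256_cvtph_epi64 (h);   // halves 0-3 -> 4 x int64
+*/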
+
+/* Intrinsics vcvtph2uqq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2uqq128_mask (__A,
+                                           _mm_setzero_si128 (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2uqq128_mask (__C, __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2uqq128_mask (__B,
+                                           _mm_setzero_si128 (),
+                                           __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2uqq256_mask (__A,
+                                           _mm256_setzero_si256 (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2uqq256_mask (__C, __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2uqq256_mask (__B,
+                                           _mm256_setzero_si256 (),
+                                           __A);
+}
+
+/* Intrinsics vcvttph2qq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2qq128_mask (__A,
+                                           _mm_setzero_si128 (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2qq128_mask (__C,
+                                           __A,
+                                           __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2qq128_mask (__B,
+                                           _mm_setzero_si128 (),
+                                           __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2qq256_mask (__A,
+                                           _mm256_setzero_si256 (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2qq256_mask (__C,
+                                           __A,
+                                           __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2qq256_mask (__B,
+                                           _mm256_setzero_si256 (),
+                                           __A);
+}
+
+/* Intrinsics vcvttph2uqq.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2uqq128_mask (__A,
+                                            _mm_setzero_si128 (),
+                                            (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu64 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2uqq128_mask (__C,
+                                            __A,
+                                            __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2uqq128_mask (__B,
+                                            _mm_setzero_si128 (),
+                                            __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu64 (__m128h __A)
+{
+  return __builtin_ia32_vcvttph2uqq256_mask (__A,
+                                            _mm256_setzero_si256 (),
+                                            (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu64 (__m256i __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvttph2uqq256_mask (__C,
+                                            __A,
+                                            __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvttph2uqq256_mask (__B,
+                                            _mm256_setzero_si256 (),
+                                            __A);
+}
+
+/* Intrinsics vcvtqq2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_ph (__m128i __A)
+{
+  return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_ph (__mmask8 __A, __m128i __B)
+{
+  return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __B,
+                                          _mm_setzero_ph (),
+                                          __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_ph (__m256i __A)
+{
+  return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+  return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_ph (__mmask8 __A, __m256i __B)
+{
+  return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __B,
+                                          _mm_setzero_ph (),
+                                          __A);
+}
+
+/* Intrinsics vcvtuqq2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu64_ph (__m128i __A)
+{
+  return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __A,
+                                           _mm_setzero_ph (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu64_ph (__mmask8 __A, __m128i __B)
+{
+  return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __B,
+                                           _mm_setzero_ph (),
+                                           __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu64_ph (__m256i __A)
+{
+  return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __A,
+                                           _mm_setzero_ph (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m256i __C)
+{
+  return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu64_ph (__mmask8 __A, __m256i __B)
+{
+  return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __B,
+                                           _mm_setzero_ph (),
+                                           __A);
+}
+
+/* Intrinsics vcvtph2w.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epi16 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2w128_mask (__A,
+                                    (__v8hi)
+                                    _mm_setzero_si128 (),
+                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epi16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2w128_mask (__C, ( __v8hi) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epi16 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2w128_mask (__B,
+                                    (__v8hi)
+                                    _mm_setzero_si128 (),
+                                    __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epi16 (__m256h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2w256_mask (__A,
+                                    (__v16hi)
+                                    _mm256_setzero_si256 (),
+                                    (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epi16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2w256_mask (__C, ( __v16hi) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epi16 (__mmask16 __A, __m256h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2w256_mask (__B,
+                                    (__v16hi)
+                                    _mm256_setzero_si256 (),
+                                    __A);
+}
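+
+/* Editor's note (not part of the upstream header): the FP16<->16-bit
+   integer conversions are lane-for-lane, so the vector width is
+   preserved: 8 elements for the 128-bit forms, 16 for the 256-bit
+   forms, e.g.
+
+     __m256i w = _mm256_cvtph_epi16 (h256);   // 16 halves -> 16 x int16
+*/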
+
+/* Intrinsics vcvtph2uw.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_epu16 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2uw128_mask (__A,
+                                     (__v8hi)
+                                     _mm_setzero_si128 (),
+                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_epu16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2uw128_mask (__C, ( __v8hi) __A, __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_epu16 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvtph2uw128_mask (__B,
+                                     (__v8hi)
+                                     _mm_setzero_si128 (),
+                                     __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_epu16 (__m256h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2uw256_mask (__A,
+                                     (__v16hi)
+                                     _mm256_setzero_si256 (),
+                                     (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_epu16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2uw256_mask (__C, ( __v16hi) __A, __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_epu16 (__mmask16 __A, __m256h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvtph2uw256_mask (__B,
+                                     (__v16hi)
+                                     _mm256_setzero_si256 (),
+                                     __A);
+}
+
+/* Intrinsics vcvttph2w.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epi16 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2w128_mask (__A,
+                                     (__v8hi)
+                                     _mm_setzero_si128 (),
+                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epi16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2w128_mask (__C,
+                                     ( __v8hi) __A,
+                                     __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epi16 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2w128_mask (__B,
+                                     (__v8hi)
+                                     _mm_setzero_si128 (),
+                                     __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epi16 (__m256h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2w256_mask (__A,
+                                     (__v16hi)
+                                     _mm256_setzero_si256 (),
+                                     (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epi16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2w256_mask (__C,
+                                     ( __v16hi) __A,
+                                     __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epi16 (__mmask16 __A, __m256h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2w256_mask (__B,
+                                     (__v16hi)
+                                     _mm256_setzero_si256 (),
+                                     __A);
+}
+
+/* Intrinsics vcvttph2uw.  */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttph_epu16 (__m128h __A)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2uw128_mask (__A,
+                                      (__v8hi)
+                                      _mm_setzero_si128 (),
+                                      (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttph_epu16 (__m128i __A, __mmask8 __B, __m128h __C)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2uw128_mask (__C,
+                                      ( __v8hi) __A,
+                                      __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttph_epu16 (__mmask8 __A, __m128h __B)
+{
+  return (__m128i)
+    __builtin_ia32_vcvttph2uw128_mask (__B,
+                                      (__v8hi)
+                                      _mm_setzero_si128 (),
+                                      __A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttph_epu16 (__m256h __A)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2uw256_mask (__A,
+                                      (__v16hi)
+                                      _mm256_setzero_si256 (),
+                                      (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttph_epu16 (__m256i __A, __mmask16 __B, __m256h __C)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2uw256_mask (__C,
+                                      ( __v16hi) __A,
+                                      __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttph_epu16 (__mmask16 __A, __m256h __B)
+{
+  return (__m256i)
+    __builtin_ia32_vcvttph2uw256_mask (__B,
+                                      (__v16hi) _mm256_setzero_si256 (),
+                                      __A);
+}
+
+/* Intrinsics vcvtw2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_ph (__m128i __A)
+{
+  return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __A,
+                                         _mm_setzero_ph (),
+                                         (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __C,
+                                         __A,
+                                         __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_ph (__mmask8 __A, __m128i __B)
+{
+  return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __B,
+                                         _mm_setzero_ph (),
+                                         __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_ph (__m256i __A)
+{
+  return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __A,
+                                         _mm256_setzero_ph (),
+                                         (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_ph (__m256h __A, __mmask16 __B, __m256i __C)
+{
+  return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __C,
+                                         __A,
+                                         __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_ph (__mmask16 __A, __m256i __B)
+{
+  return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __B,
+                                         _mm256_setzero_ph (),
+                                         __A);
+}
+
+/* Intrinsics vcvtuw2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_ph (__m128i __A)
+{
+  return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu16_ph (__m128h __A, __mmask8 __B, __m128i __C)
+{
+  return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu16_ph (__mmask8 __A, __m128i __B)
+{
+  return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __B,
+                                          _mm_setzero_ph (),
+                                          __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu16_ph (__m256i __A)
+{
+  return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __A,
+                                          _mm256_setzero_ph (),
+                                          (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu16_ph (__m256h __A, __mmask16 __B, __m256i __C)
+{
+  return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __C, __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu16_ph (__mmask16 __A, __m256i __B)
+{
+  return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __B,
+                                          _mm256_setzero_ph (),
+                                          __A);
+}
+
+/* Intrinsics vcvtph2pd.  */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_pd (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2pd128_mask (__A,
+                                          _mm_setzero_pd (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_pd (__m128d __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2pd128_mask (__C, __A, __B);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2pd128_mask (__B, _mm_setzero_pd (), __A);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_pd (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2pd256_mask (__A,
+                                          _mm256_setzero_pd (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_pd (__m256d __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2pd256_mask (__C, __A, __B);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_pd (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2pd256_mask (__B,
+                                          _mm256_setzero_pd (),
+                                          __A);
+}
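+
+/* Editor's note (not part of the upstream header): as with the 64-bit
+   integer conversions, FP16->double widens from the lowest source
+   halves only: 2 lanes into __m128d, 4 lanes into __m256d, e.g.
+
+     __m256d d = _mm256_cvtph_pd (h);   // halves 0-3 -> 4 doubles
+*/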
+
+/* Intrinsics vcvtph2ps.  */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtxph_ps (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2psx128_mask (__A,
+                                          _mm_setzero_ps (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtxph_ps (__m128 __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2psx128_mask (__C, __A, __B);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2psx128_mask (__B, _mm_setzero_ps (), __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtxph_ps (__m128h __A)
+{
+  return __builtin_ia32_vcvtph2psx256_mask (__A,
+                                           _mm256_setzero_ps (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtxph_ps (__m256 __A, __mmask8 __B, __m128h __C)
+{
+  return __builtin_ia32_vcvtph2psx256_mask (__C, __A, __B);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtxph_ps (__mmask8 __A, __m128h __B)
+{
+  return __builtin_ia32_vcvtph2psx256_mask (__B,
+                                           _mm256_setzero_ps (),
+                                           __A);
+}
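+
+/* Editor's note (not part of the upstream header): these FP16<->float
+   conversions carry an "x" in their names (mapping to VCVTPH2PSX here
+   and VCVTPS2PHX below) to avoid clashing with the older F16C
+   _mm_cvtph_ps/_mm_cvtps_ph intrinsics, which operate on integer
+   bit-pattern vectors rather than the __m128h type, e.g.
+
+     __m128  f = _mm_cvtxph_ps (h);   // low 4 halves -> 4 floats
+     __m128h g = _mm_cvtxps_ph (f);   // 4 floats -> low 4 halves, upper zeroed
+*/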
+
+/* Intrinsics vcvtxps2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtxps_ph (__m128 __A)
+{
+  return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __A,
+                                           _mm_setzero_ph (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m128 __C)
+{
+  return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtxps_ph (__mmask8 __A, __m128 __B)
+{
+  return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __B,
+                                           _mm_setzero_ph (),
+                                           __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtxps_ph (__m256 __A)
+{
+  return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __A,
+                                           _mm_setzero_ph (),
+                                           (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m256 __C)
+{
+  return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtxps_ph (__mmask8 __A, __m256 __B)
+{
+  return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __B,
+                                           _mm_setzero_ph (),
+                                           __A);
+}
+
+/* Intrinsics vcvtpd2ph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ph (__m128d __A)
+{
+  return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m128d __C)
+{
+  return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_ph (__mmask8 __A, __m128d __B)
+{
+  return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __B,
+                                          _mm_setzero_ph (),
+                                          __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_ph (__m256d __A)
+{
+  return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __A,
+                                          _mm_setzero_ph (),
+                                          (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m256d __C)
+{
+  return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __C, __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_ph (__mmask8 __A, __m256d __B)
+{
+  return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __B,
+                                          _mm_setzero_ph (),
+                                          __A);
+}
+
+/* Intrinsics vfmaddsub[132,213,231]ph.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h)__builtin_ia32_vfmaddsubph256_mask ((__v16hf)__A,
+                                                     (__v16hf)__B,
+                                                     (__v16hf)__C,
+                                                     (__mmask16)-1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_ph (__m256h __A, __mmask16 __U, __m256h __B,
+                        __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmaddsubph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmaddsub_ph (__m256h __A, __m256h __B, __m256h __C,
+                         __mmask16 __U)
+{
+  return (__m256h) __builtin_ia32_vfmaddsubph256_mask3 ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmaddsub_ph (__mmask16 __U, __m256h __A, __m256h __B,
+                         __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmaddsubph256_maskz ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h)__builtin_ia32_vfmaddsubph128_mask ((__v8hf)__A,
+                                                     (__v8hf)__B,
+                                                     (__v8hf)__C,
+                                                     (__mmask8)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmaddsub_ph (__m128h __A, __mmask8 __U, __m128h __B,
+                     __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmaddsubph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmaddsub_ph (__m128h __A, __m128h __B, __m128h __C,
+                      __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfmaddsubph128_mask3 ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmaddsub_ph (__mmask8 __U, __m128h __A, __m128h __B,
+                      __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmaddsubph128_maskz ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+/* Intrinsics vfmsubadd[132,213,231]ph.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmsubaddph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsubadd_ph (__m256h __A, __mmask16 __U, __m256h __B,
+                        __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmsubaddph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsubadd_ph (__m256h __A, __m256h __B, __m256h __C,
+                         __mmask16 __U)
+{
+  return (__m256h) __builtin_ia32_vfmsubaddph256_mask3 ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsubadd_ph (__mmask16 __U, __m256h __A, __m256h __B,
+                         __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmsubaddph256_maskz ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmsubaddph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsubadd_ph (__m128h __A, __mmask8 __U, __m128h __B,
+                     __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmsubaddph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsubadd_ph (__m128h __A, __m128h __B, __m128h __C,
+                      __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfmsubaddph128_mask3 ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsubadd_ph (__mmask8 __U, __m128h __A, __m128h __B,
+                      __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmsubaddph128_maskz ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+/* Intrinsics vfmadd[132,213,231]ph.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_ph (__m256h __A, __mmask16 __U, __m256h __B,
+                        __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_ph (__m256h __A, __m256h __B, __m256h __C,
+                         __mmask16 __U)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_mask3 ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_ph (__mmask16 __U, __m256h __A, __m256h __B,
+                         __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmaddph256_maskz ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmaddph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_ph (__m128h __A, __mmask8 __U, __m128h __B,
+                     __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmaddph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_ph (__m128h __A, __m128h __B, __m128h __C,
+                      __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfmaddph128_mask3 ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_ph (__mmask8 __U, __m128h __A, __m128h __B,
+                      __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmaddph128_maskz ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+/* Intrinsics vfnmadd[132,213,231]ph.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfnmaddph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_ph (__m256h __A, __mmask16 __U, __m256h __B,
+                        __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfnmaddph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_ph (__m256h __A, __m256h __B, __m256h __C,
+                         __mmask16 __U)
+{
+  return (__m256h) __builtin_ia32_vfnmaddph256_mask3 ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmadd_ph (__mmask16 __U, __m256h __A, __m256h __B,
+                         __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfnmaddph256_maskz ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfnmaddph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_ph (__m128h __A, __mmask8 __U, __m128h __B,
+                     __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfnmaddph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_ph (__m128h __A, __m128h __B, __m128h __C,
+                      __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfnmaddph128_mask3 ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_ph (__mmask8 __U, __m128h __A, __m128h __B,
+                      __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfnmaddph128_maskz ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+/* Intrinsics vfmsub[132,213,231]ph.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmsubph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_ph (__m256h __A, __mmask16 __U, __m256h __B,
+                        __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmsubph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_ph (__m256h __A, __m256h __B, __m256h __C,
+                         __mmask16 __U)
+{
+  return (__m256h) __builtin_ia32_vfmsubph256_mask3 ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_ph (__mmask16 __U, __m256h __A, __m256h __B,
+                         __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmsubph256_maskz ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmsubph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_ph (__m128h __A, __mmask8 __U, __m128h __B,
+                     __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmsubph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_ph (__m128h __A, __m128h __B, __m128h __C,
+                      __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfmsubph128_mask3 ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_ph (__mmask8 __U, __m128h __A, __m128h __B,
+                      __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmsubph128_maskz ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+/* Intrinsics vfnmsub[132,213,231]ph.  */
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_ph (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfnmsubph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) -1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_ph (__m256h __A, __mmask16 __U, __m256h __B,
+                        __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfnmsubph256_mask ((__v16hf) __A,
+                                                      (__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__mmask16) __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_ph (__m256h __A, __m256h __B, __m256h __C,
+                         __mmask16 __U)
+{
+  return (__m256h) __builtin_ia32_vfnmsubph256_mask3 ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_ph (__mmask16 __U, __m256h __A, __m256h __B,
+                         __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfnmsubph256_maskz ((__v16hf) __A,
+                                                       (__v16hf) __B,
+                                                       (__v16hf) __C,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ph (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfnmsubph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_ph (__m128h __A, __mmask8 __U, __m128h __B,
+                     __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfnmsubph128_mask ((__v8hf) __A,
+                                                      (__v8hf) __B,
+                                                      (__v8hf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_ph (__m128h __A, __m128h __B, __m128h __C,
+                      __mmask8 __U)
+{
+  return (__m128h) __builtin_ia32_vfnmsubph128_mask3 ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_ph (__mmask8 __U, __m128h __A, __m128h __B,
+                      __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfnmsubph128_maskz ((__v8hf) __A,
+                                                       (__v8hf) __B,
+                                                       (__v8hf) __C,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+/* Intrinsics vf[,c]maddcph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_pch (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmaddcph128 ((__v8hf) __A,
+                                               (__v8hf) __B,
+                                               (__v8hf) __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A,
+                                     (__v8hf) __C,
+                                     (__v8hf) __D, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C,  __mmask8 __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfmaddcph128_mask3 ((__v8hf) __A,
+                                      (__v8hf) __B,
+                                      (__v8hf) __C, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+  return (__m128h) __builtin_ia32_vfmaddcph128_maskz ((__v8hf) __B,
+                                                     (__v8hf) __C,
+                                                     (__v8hf) __D, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_pch (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmaddcph256 ((__v16hf) __A,
+                                               (__v16hf) __B,
+                                               (__v16hf) __C);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+  return (__m256h)
+     __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A,
+                                      (__v16hf) __C,
+                                      (__v16hf) __D, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C,  __mmask8 __D)
+{
+  return (__m256h)
+    __builtin_ia32_vfmaddcph256_mask3 ((__v16hf) __A,
+                                      (__v16hf) __B,
+                                      (__v16hf) __C, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D)
+{
+  return (__m256h)__builtin_ia32_vfmaddcph256_maskz ((__v16hf) __B,
+                                                    (__v16hf) __C,
+                                                    (__v16hf) __D, __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfcmaddcph128 ((__v8hf) __A,
+                                                (__v8hf) __B,
+                                                (__v8hf) __C);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)
+     __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A,
+                                       (__v8hf) __C,
+                                       (__v8hf) __D, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C,  __mmask8 __D)
+{
+  return (__m128h)
+    __builtin_ia32_vfcmaddcph128_mask3 ((__v8hf) __A,
+                                       (__v8hf) __B,
+                                       (__v8hf) __C, __D);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D)
+{
+  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz ((__v8hf) __B,
+                                                     (__v8hf) __C,
+                                                     (__v8hf) __D, __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256 ((__v16hf) __A,
+                                                (__v16hf) __B,
+                                                (__v16hf) __C);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+  return (__m256h)
+     __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A,
+                                       (__v16hf) __C,
+                                       (__v16hf) __D, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C,  __mmask8 __D)
+{
+  return (__m256h)
+    __builtin_ia32_vfcmaddcph256_mask3 ((__v16hf) __A,
+                                       (__v16hf) __B,
+                                       (__v16hf) __C, __D);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D)
+{
+  return (__m256h) __builtin_ia32_vfcmaddcph256_maskz ((__v16hf) __B,
+                                                      (__v16hf) __C,
+                                                      (__v16hf) __D, __A);
+}
+
+/* Intrinsics vf[,c]mulcph.  */
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmul_pch (__m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfmulcph128 ((__v8hf) __A, (__v8hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __C,
+                                                   (__v8hf) __D,
+                                                   (__v8hf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmul_pch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __B,
+                                                   (__v8hf) __C,
+                                                   _mm_setzero_ph (),
+                                                   __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmul_pch (__m256h __A, __m256h __B)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256 ((__v16hf) __A,
+                                              (__v16hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __C,
+                                                   (__v16hf) __D,
+                                                   (__v16hf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __B,
+                                                   (__v16hf) __C,
+                                                   _mm256_setzero_ph (),
+                                                   __A);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fcmul_pch (__m128h __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vfcmulcph128 ((__v8hf) __A,
+                                               (__v8hf) __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fcmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D)
+{
+  return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __C,
+                                                    (__v8hf) __D,
+                                                    (__v8hf) __A, __B);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fcmul_pch (__mmask8 __A, __m128h __B, __m128h __C)
+{
+  return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __B,
+                                                    (__v8hf) __C,
+                                                    _mm_setzero_ph (),
+                                                    __A);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fcmul_pch (__m256h __A, __m256h __B)
+{
+  return (__m256h) __builtin_ia32_vfcmulcph256 ((__v16hf) __A, (__v16hf) __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fcmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D)
+{
+  return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __C,
+                                                    (__v16hf) __D,
+                                                    (__v16hf) __A, __B);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C)
+{
+  return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __B,
+                                                    (__v16hf) __C,
+                                                    _mm256_setzero_ph (),
+                                                    __A);
+}
+
+#define _MM256_REDUCE_OP(op)                                           \
+  __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0);   \
+  __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1);   \
+  __m128h __T3 = (__T1 op __T2);                                       \
+  __m128h __T4 = (__m128h) __builtin_shuffle (__T3,                    \
+                (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 });                  \
+  __m128h __T5 = (__T3) op (__T4);                                     \
+  __m128h __T6 = (__m128h) __builtin_shuffle (__T5,                    \
+                (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 });                  \
+  __m128h __T7 = __T5 op __T6;                                         \
+  return __T7[0] op __T7[1]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_add_ph (__m256h __A)
+{
+  _MM256_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_mul_ph (__m256h __A)
+{
+  _MM256_REDUCE_OP (*);
+}
+
+#undef _MM256_REDUCE_OP
+#define _MM256_REDUCE_OP(op)                                           \
+  __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0);   \
+  __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1);   \
+  __m128h __T3 = _mm_##op (__T1, __T2);                                \
+  __m128h __T4 = (__m128h) __builtin_shuffle (__T3,                    \
+                (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 });                  \
+  __m128h __T5 = _mm_##op (__T3, __T4);                                \
+  __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); \
+  __m128h __T7 = _mm_##op (__T5, __T6);                                \
+  __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); \
+  __m128h __T9 = _mm_##op (__T7, __T8);                                \
+  return __T9[0]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_min_ph (__m256h __A)
+{
+  _MM256_REDUCE_OP (min_ph);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_max_ph (__m256h __A)
+{
+  _MM256_REDUCE_OP (max_ph);
+}
+
+#define _MM_REDUCE_OP(op)                                              \
+  __m128h __T1 = (__m128h) __builtin_shuffle (__A,                     \
+                (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 });                  \
+  __m128h __T2 = (__A) op (__T1);                                      \
+  __m128h __T3 = (__m128h) __builtin_shuffle (__T2,                    \
+                (__v8hi){ 2, 3, 0, 1, 4, 5, 6, 7 });                   \
+  __m128h __T4 = __T2 op __T3;                                         \
+  return __T4[0] op __T4[1]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_add_ph (__m128h __A)
+{
+  _MM_REDUCE_OP (+);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_mul_ph (__m128h __A)
+{
+  _MM_REDUCE_OP (*);
+}
+
+#undef _MM_REDUCE_OP
+#define _MM_REDUCE_OP(op)                                              \
+  __m128h __T1 = (__m128h) __builtin_shuffle (__A,                     \
+                (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 });                  \
+  __m128h __T2 = _mm_##op (__A, __T1);                                 \
+  __m128h __T3 = (__m128h) __builtin_shuffle (__T2, (__v8hi){ 4, 5 }); \
+  __m128h __T4 = _mm_##op (__T2, __T3);                                \
+  __m128h __T5 = (__m128h) __builtin_shuffle (__T4, (__v8hi){ 1, 0 }); \
+  __m128h __T6 = _mm_##op (__T4, __T5);                                \
+  return __T6[0]
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_min_ph (__m128h __A)
+{
+  _MM_REDUCE_OP (min_ph);
+}
+
+extern __inline _Float16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_max_ph (__m128h __A)
+{
+  _MM_REDUCE_OP (max_ph);
+}
+
+#undef _MM256_REDUCE_OP
+#undef _MM_REDUCE_OP
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ph (__mmask16 __U, __m256h __A, __m256h __W)
+{
+  return (__m256h) __builtin_ia32_movdquhi256_mask ((__v16hi) __W,
+                                                   (__v16hi) __A,
+                                                   (__mmask16) __U);
+
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_ph (__m256h __A, __m256i __I, __m256h __B)
+{
+  return (__m256h) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+                                                      (__v16hi) __I,
+                                                      (__v16hi) __B,
+                                                      (__mmask16)-1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_ph (__m256i __A, __m256h __B)
+{
+  return (__m256h) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                                                    (__v16hi) __A,
+                                                    (__v16hi)
+                                                    (_mm256_setzero_ph ()),
+                                                    (__mmask16)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ph (__mmask8 __U, __m128h __A, __m128h __W)
+{
+  return (__m128h) __builtin_ia32_movdquhi128_mask ((__v8hi) __W,
+                                                   (__v8hi) __A,
+                                                   (__mmask8) __U);
+
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_ph (__m128h __A, __m128i __I, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+                                                      (__v8hi) __I,
+                                                      (__v8hi) __B,
+                                                      (__mmask8)-1);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_ph (__m128i __A, __m128h __B)
+{
+  return (__m128h) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                                                    (__v8hi) __A,
+                                                    (__v8hi)
+                                                    (_mm_setzero_ph ()),
+                                                    (__mmask8)-1);
+}
+
+extern __inline __m256h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pch (_Float16 _Complex __A)
+{
+  union
+  {
+    _Float16 _Complex __a;
+    float __b;
+  } __u = { .__a = __A };
+
+  return (__m256h) _mm256_set1_ps (__u.__b);
+}
+
+extern __inline __m128h
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pch (_Float16 _Complex __A)
+{
+  union
+  {
+    _Float16 _Complex __a;
+    float __b;
+  } __u = { .__a = __A };
+
+  return (__m128h) _mm_set1_ps (__u.__b);
+}
+
+// Intrinsics below are aliases for f*mul_*ch.
+#define _mm_mul_pch(A, B) _mm_fmul_pch ((A), (B))
+#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch ((W), (U), (A), (B))
+#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch ((U), (A), (B))
+#define _mm256_mul_pch(A, B) _mm256_fmul_pch ((A), (B))
+#define _mm256_mask_mul_pch(W, U, A, B)                                      \
+  _mm256_mask_fmul_pch ((W), (U), (A), (B))
+#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch ((U), (A), (B))
+
+#define _mm_cmul_pch(A, B) _mm_fcmul_pch ((A), (B))
+#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch ((W), (U), (A), (B))
+#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch ((U), (A), (B))
+#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch ((A), (B))
+#define _mm256_mask_cmul_pch(W, U, A, B)                             \
+   _mm256_mask_fcmul_pch ((W), (U), (A), (B))
+#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch((U), (A), (B))
+
+#ifdef __DISABLE_AVX512FP16VL__
+#undef __DISABLE_AVX512FP16VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512FP16VL__ */
+
+#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */
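
For orientation, here is a minimal usage sketch of the AVX512-FP16/VL intrinsics defined in the header above. It is not part of the committed file; the function names are illustrative only, and it assumes a toolchain where these headers are active (e.g. compiled with -O2 -mavx512fp16 -mavx512vl).

    #include <immintrin.h>

    /* Per-lane FMA over 8 half-precision lanes, then a horizontal add.  */
    static _Float16 example_fp16_dot8 (__m128h va, __m128h vb)
    {
      __m128h acc = _mm_fmadd_ph (va, vb, _mm_setzero_ph ());
      return _mm_reduce_add_ph (acc);
    }

    /* Zero-masked variant: only lanes 0..3 are computed, lanes 4..7 are zeroed.  */
    static __m128h example_fp16_low4 (__m128h va, __m128h vb)
    {
      return _mm_maskz_fmadd_ph (0x0F, va, vb, _mm_setzero_ph ());
    }

    /* Complex half-precision multiply: each adjacent (even, odd) pair of lanes
       holds one (re, im) value, so a __m128h carries four complex numbers.  */
    static __m128h example_fp16_cmul (__m128h z1, __m128h z2)
    {
      return _mm_fmul_pch (z1, z2);
    }

The 256-bit forms (_mm256_*) follow the same pattern with __m256h operands and 16 half-precision lanes.
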
diff --git a/include-gcc/avx512ifmaintrin.h b/include-gcc/avx512ifmaintrin.h
new file mode 100644 (file)
index 0000000..fc97f1d
--- /dev/null
@@ -0,0 +1,104 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512ifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512IFMAINTRIN_H_INCLUDED
+#define _AVX512IFMAINTRIN_H_INCLUDED
+
+#ifndef __AVX512IFMA__
+#pragma GCC push_options
+#pragma GCC target("avx512ifma")
+#define __DISABLE_AVX512IFMA__
+#endif /* __AVX512IFMA__ */
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X,
+                                                      (__v8di) __Y,
+                                                      (__v8di) __Z,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X,
+                                                      (__v8di) __Y,
+                                                      (__v8di) __Z,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+                           __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W,
+                                                      (__v8di) __X,
+                                                      (__v8di) __Y,
+                                                      (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X,
+                           __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W,
+                                                      (__v8di) __X,
+                                                      (__v8di) __Y,
+                                                      (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X,
+                                                       (__v8di) __Y,
+                                                       (__v8di) __Z,
+                                                       (__mmask8) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z)
+{
+  return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X,
+                                                       (__v8di) __Y,
+                                                       (__v8di) __Z,
+                                                       (__mmask8) __M);
+}
+
+#ifdef __DISABLE_AVX512IFMA__
+#undef __DISABLE_AVX512IFMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512IFMA__ */
+
+#endif /* _AVX512IFMAINTRIN_H_INCLUDED */
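
The AVX512-IFMA intrinsics above map onto VPMADD52LUQ/VPMADD52HUQ: each 64-bit lane adds either the low or the high 52 bits of the 104-bit product of the low 52 bits of two operands into an accumulator. A small sketch, with illustrative names and assuming a build with -mavx512ifma:

    #include <immintrin.h>

    /* Per 64-bit lane: *lo_acc += low 52 bits of (x[51:0] * y[51:0]),
       *hi_acc += high 52 bits of the same 104-bit product.  */
    static void example_madd52 (__m512i *lo_acc, __m512i *hi_acc,
                                __m512i x, __m512i y)
    {
      *lo_acc = _mm512_madd52lo_epu64 (*lo_acc, x, y);
      *hi_acc = _mm512_madd52hi_epu64 (*hi_acc, x, y);
    }

Accumulating the low and high halves separately like this is the usual building block for multi-word (big-integer) multiplication on AVX-512.
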
diff --git a/include-gcc/avx512ifmavlintrin.h b/include-gcc/avx512ifmavlintrin.h
new file mode 100644 (file)
index 0000000..cac55fe
--- /dev/null
@@ -0,0 +1,145 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512ifmavlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512IFMAVLINTRIN_H_INCLUDED
+#define _AVX512IFMAVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512IFMA__)
+#pragma GCC push_options
+#pragma GCC target("avx512ifma,avx512vl")
+#define __DISABLE_AVX512IFMAVL__
+#endif /* __AVX512IFMAVL__ */
+
+#define _mm_madd52lo_epu64(A, B, C)                      \
+  ((__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) (A), \
+                                           (__v2di) (B), \
+                                           (__v2di) (C)))
+
+#define _mm_madd52hi_epu64(A, B, C)                      \
+  ((__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) (A), \
+                                           (__v2di) (B), \
+                                           (__v2di) (C)))
+
+#define _mm256_madd52lo_epu64(A, B, C)                   \
+  ((__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) (A), \
+                                           (__v4di) (B), \
+                                           (__v4di) (C)))
+
+
+#define _mm256_madd52hi_epu64(A, B, C)                   \
+  ((__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) (A), \
+                                           (__v4di) (B), \
+                                           (__v4di) (C)))
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W,
+                                                      (__v2di) __X,
+                                                      (__v2di) __Y,
+                                                      (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W,
+                                                      (__v2di) __X,
+                                                      (__v2di) __Y,
+                                                      (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+                           __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W,
+                                                      (__v4di) __X,
+                                                      (__v4di) __Y,
+                                                      (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X,
+                           __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W,
+                                                      (__v4di) __X,
+                                                      (__v4di) __Y,
+                                                      (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X,
+                                                       (__v2di) __Y,
+                                                       (__v2di) __Z,
+                                                       (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X,
+                                                       (__v2di) __Y,
+                                                       (__v2di) __Z,
+                                                       (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X,
+                                                       (__v4di) __Y,
+                                                       (__v4di) __Z,
+                                                       (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X,
+                                                       (__v4di) __Y,
+                                                       (__v4di) __Z,
+                                                       (__mmask8) __M);
+}
+
+#ifdef __DISABLE_AVX512IFMAVL__
+#undef __DISABLE_AVX512IFMAVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512IFMAVL__ */
+
+#endif /* _AVX512IFMAVLINTRIN_H_INCLUDED */
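
The VL variants above provide the same 52-bit multiply-accumulate at 128-bit and 256-bit width, plus merge- and zero-masked forms. A sketch with an illustrative name, assuming -mavx512ifma -mavx512vl:

    #include <immintrin.h>

    /* Merge-masked form: with mask 0x5 only 64-bit lanes 0 and 2 are updated
       to acc + low 52 bits of (x * y); lanes 1 and 3 keep their values from acc.  */
    static __m256i example_masked_madd52lo (__m256i acc, __m256i x, __m256i y)
    {
      return _mm256_mask_madd52lo_epu64 (acc, 0x5, x, y);
    }
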
diff --git a/include-gcc/avx512pfintrin.h b/include-gcc/avx512pfintrin.h
new file mode 100644 (file)
index 0000000..a547610
--- /dev/null
@@ -0,0 +1,269 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512pfintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512PFINTRIN_H_INCLUDED
+#define _AVX512PFINTRIN_H_INCLUDED
+
+#ifndef __AVX512PF__
+#pragma GCC push_options
+#pragma GCC target("avx512pf")
+#define __DISABLE_AVX512PF__
+#endif /* __AVX512PF__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef long long __v8di __attribute__ ((__vector_size__ (64)));
+typedef int __v16si __attribute__ ((__vector_size__ (64)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__));
+
+typedef unsigned char  __mmask8;
+typedef unsigned short __mmask16;
+
+#ifdef __OPTIMIZE__
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr,
+                             int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr,
+                             int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask,
+                                  void const *__addr, int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfdpd (__mask, (__v8si) __index, __addr, __scale,
+                             __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32gather_ps (__m512i __index, __mmask16 __mask,
+                                  void const *__addr, int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfdps (__mask, (__v16si) __index, __addr, __scale,
+                             __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr,
+                             int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr,
+                             int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask,
+                                  void const *__addr, int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfqpd (__mask, (__v8di) __index, __addr, __scale,
+                             __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64gather_ps (__m512i __index, __mmask8 __mask,
+                                  void const *__addr, int __scale, int __hint)
+{
+  __builtin_ia32_gatherpfqps (__mask, (__v8di) __index, __addr, __scale,
+                             __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32scatter_pd (void *__addr, __m256i __index, int __scale,
+                              int __hint)
+{
+  __builtin_ia32_scatterpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i32scatter_ps (void *__addr, __m512i __index, int __scale,
+                              int __hint)
+{
+  __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32scatter_pd (void *__addr, __mmask8 __mask,
+                                   __m256i __index, int __scale, int __hint)
+{
+  __builtin_ia32_scatterpfdpd (__mask, (__v8si) __index, __addr, __scale,
+                              __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i32scatter_ps (void *__addr, __mmask16 __mask,
+                                   __m512i __index, int __scale, int __hint)
+{
+  __builtin_ia32_scatterpfdps (__mask, (__v16si) __index, __addr, __scale,
+                              __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64scatter_pd (void *__addr, __m512i __index, int __scale,
+                              int __hint)
+{
+  __builtin_ia32_scatterpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_prefetch_i64scatter_ps (void *__addr, __m512i __index, int __scale,
+                              int __hint)
+{
+  __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr,
+                             __scale, __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64scatter_pd (void *__addr, __mmask8 __mask,
+                                   __m512i __index, int __scale, int __hint)
+{
+  __builtin_ia32_scatterpfqpd (__mask, (__v8di) __index, __addr, __scale,
+                              __hint);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_prefetch_i64scatter_ps (void *__addr, __mmask8 __mask,
+                                   __m512i __index, int __scale, int __hint)
+{
+  __builtin_ia32_scatterpfqps (__mask, (__v8di) __index, __addr, __scale,
+                              __hint);
+}
+
+#else
+#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT)              \
+  __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX),     \
+                             (void const *) (ADDR), (int) (SCALE),          \
+                             (int) (HINT))
+
+#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT)              \
+  __builtin_ia32_gatherpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), \
+                             (void const *) (ADDR), (int) (SCALE),          \
+                             (int) (HINT))
+
+#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX),  \
+                             (void const *) (ADDR), (int) (SCALE),          \
+                             (int) (HINT))
+
+#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfdps ((__mmask16) (MASK), (__v16si)(__m512i) (INDEX),\
+                             (void const *) (ADDR), (int) (SCALE),          \
+                             (int) (HINT))
+
+#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT)              \
+  __builtin_ia32_gatherpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),     \
+                             (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT)              \
+  __builtin_ia32_gatherpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),     \
+                             (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX),  \
+                             (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT)    \
+  __builtin_ia32_gatherpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX),  \
+                             (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX),    \
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX),\
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), \
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfdps ((__mmask16) (MASK),                          \
+                              (__v16si)(__m512i) (INDEX),                   \
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),    \
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT)              \
+  __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX),    \
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+
+#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT)   \
+  __builtin_ia32_scatterpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \
+                              (void *) (ADDR), (int) (SCALE), (int) (HINT))
+#endif
+
+#ifdef __DISABLE_AVX512PF__
+#undef __DISABLE_AVX512PF__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512PF__ */
+
+#endif /* _AVX512PFINTRIN_H_INCLUDED */
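For reference, a minimal usage sketch (not part of the diff) of the AVX-512PF gather-prefetch intrinsic declared above. AVX-512PF is only implemented on Knights Landing class hardware, so this is a sketch rather than something generally runnable; the array name, indices, and the -mavx512pf flag are illustrative assumptions:

#include <immintrin.h>

/* Prefetch table[0], table[8], ..., table[56] into L1 ahead of a later gather
   (element scale = 8 bytes, i.e. sizeof(double)). */
void prefetch_rows(const double *table)
{
  __m256i idx = _mm256_setr_epi32(0, 8, 16, 24, 32, 40, 48, 56);
  _mm512_prefetch_i32gather_pd(idx, table, 8, _MM_HINT_T0);
}

The scale and hint arguments must be compile-time constants, which is why the header provides the macro fallbacks above when __OPTIMIZE__ is not defined.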
diff --git a/include-gcc/avx512vbmi2intrin.h b/include-gcc/avx512vbmi2intrin.h
new file mode 100644 (file)
index 0000000..528d193
--- /dev/null
@@ -0,0 +1,557 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmi2intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VBMI2INTRIN_H_INCLUDED
+#define __AVX512VBMI2INTRIN_H_INCLUDED
+
+#if !defined(__AVX512VBMI2__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2")
+#define __DISABLE_AVX512VBMI2__
+#endif /* __AVX512VBMI2__ */
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdi_epi16 (__m512i __A, __m512i __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)__A, (__v32hi) __B,
+                                                                       __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdi_epi32 (__m512i __A, __m512i __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)__A, (__v16si) __B,
+                                                                       __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D,
+                                                               int __E)
+{
+  return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__C,
+                       (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D)
+{
+  return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__B,
+       (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdi_epi64 (__m512i __A, __m512i __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)__A, (__v8di) __B, __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
+                                                               int __E)
+{
+  return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__C, (__v8di) __D,
+                                       __E, (__v8di) __A, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D)
+{
+  return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__B, (__v8di) __C,
+                       __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldi_epi16 (__m512i __A, __m512i __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)__A, (__v32hi) __B,
+                                                                       __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldi_epi32 (__m512i __A, __m512i __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vpshld_v16si ((__v16si)__A, (__v16si) __B,
+                                                                       __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D,
+                                                               int __E)
+{
+  return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__C,
+                       (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D)
+{
+  return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__B,
+       (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldi_epi64 (__m512i __A, __m512i __B, int __C)
+{
+  return (__m512i) __builtin_ia32_vpshld_v8di ((__v8di)__A, (__v8di) __B, __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D,
+                                                               int __E)
+{
+  return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__C, (__v8di) __D,
+                                       __E, (__v8di) __A, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D)
+{
+  return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__B, (__v8di) __C,
+                       __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A);
+}
+#else
+#define _mm512_shrdi_epi16(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), \
+                                         (__v32hi)(__m512i)(B),(int)(C)))
+#define _mm512_shrdi_epi32(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), \
+                                         (__v16si)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shrdi_epi32(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), \
+                                              (__v16si)(__m512i)(D), \
+                                              (int)(E),                \
+                                              (__v16si)(__m512i)(A),   \
+                                              (__mmask16)(B)))
+#define _mm512_maskz_shrdi_epi32(A, B, C, D) \
+  ((__m512i) \
+   __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B),            \
+                                    (__v16si)(__m512i)(C),(int)(D),    \
+                                    (__v16si)(__m512i)_mm512_setzero_si512 (), \
+                                    (__mmask16)(A)))
+#define _mm512_shrdi_epi64(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), \
+                                        (__v8di)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shrdi_epi64(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), \
+                                             (__v8di)(__m512i)(D), (int)(E), \
+                                             (__v8di)(__m512i)(A), \
+                                             (__mmask8)(B)))
+#define _mm512_maskz_shrdi_epi64(A, B, C, D) \
+  ((__m512i) \
+   __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B),              \
+                                   (__v8di)(__m512i)(C),(int)(D),      \
+                                   (__v8di)(__m512i)_mm512_setzero_si512 (), \
+                                   (__mmask8)(A)))
+#define _mm512_shldi_epi16(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), \
+                                         (__v32hi)(__m512i)(B),(int)(C)))
+#define _mm512_shldi_epi32(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A),       \
+                                         (__v16si)(__m512i)(B),(int)(C)))
+#define _mm512_mask_shldi_epi32(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), \
+                                              (__v16si)(__m512i)(D), \
+                                              (int)(E),                \
+                                              (__v16si)(__m512i)(A), \
+                                              (__mmask16)(B)))
+#define _mm512_maskz_shldi_epi32(A, B, C, D) \
+  ((__m512i) \
+   __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B),            \
+                                    (__v16si)(__m512i)(C),(int)(D),    \
+                                    (__v16si)(__m512i)_mm512_setzero_si512 (), \
+                                    (__mmask16)(A)))
+#define _mm512_shldi_epi64(A, B, C) \
+  ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), \
+                                        (__v8di)(__m512i)(B), (int)(C)))
+#define _mm512_mask_shldi_epi64(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), \
+                                             (__v8di)(__m512i)(D), (int)(E), \
+                                             (__v8di)(__m512i)(A), \
+                                             (__mmask8)(B)))
+#define _mm512_maskz_shldi_epi64(A, B, C, D) \
+  ((__m512i) \
+   __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B),              \
+                                   (__v8di)(__m512i)(C),(int)(D),      \
+                                   (__v8di)(__m512i)_mm512_setzero_si512 (), \
+                                   (__mmask8)(A)))
+#endif
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdv_epi16 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpshrdv_v32hi ((__v32hi)__A, (__v32hi) __B,
+                                                               (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdv_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpshrdv_v16si ((__v16si)__A, (__v16si) __B,
+                                                               (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshrdv_v16si_mask ((__v16si)__A,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz ((__v16si)__B,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shrdv_epi64 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpshrdv_v8di ((__v8di)__A, (__v8di) __B,
+                                                               (__v8di) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshrdv_v8di_mask ((__v8di)__A, (__v8di) __C,
+                                               (__v8di) __D, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz ((__v8di)__B, (__v8di) __C,
+                                                (__v8di) __D, (__mmask8)__A);
+}
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldv_epi16 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpshldv_v32hi ((__v32hi)__A, (__v32hi) __B,
+                                                        (__v32hi) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldv_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpshldv_v16si ((__v16si)__A, (__v16si) __B,
+                                                               (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshldv_v16si_mask ((__v16si)__A,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshldv_v16si_maskz ((__v16si)__B,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_shldv_epi64 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpshldv_v8di ((__v8di)__A, (__v8di) __B,
+                                                               (__v8di) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshldv_v8di_mask ((__v8di)__A, (__v8di) __C,
+                                               (__v8di) __D, (__mmask8)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshldv_v8di_maskz ((__v8di)__B, (__v8di) __C,
+                                               (__v8di) __D, (__mmask8)__A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2__
+#undef __DISABLE_AVX512VBMI2__
+
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2__ */
+
+#if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512bw")
+#define __DISABLE_AVX512VBMI2BW__
+#endif /* __AVX512VBMI2BW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__C,
+                                               (__v64qi)__A, (__mmask64)__B);
+}
+
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi8 (__mmask64 __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__B,
+                       (__v64qi)_mm512_setzero_si512 (), (__mmask64)__A);
+}
+
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi8 (void * __A, __mmask64 __B, __m512i __C)
+{
+  __builtin_ia32_compressstoreuqi512_mask ((__v64qi *) __A, (__v64qi) __C,
+                                                       (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compress_epi16 (__m512i __A, __mmask32 __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__C,
+                                               (__v32hi)__A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_compress_epi16 (__mmask32 __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__B,
+                       (__v32hi)_mm512_setzero_si512 (), (__mmask32)__A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_compressstoreu_epi16 (void * __A, __mmask32 __B, __m512i __C)
+{
+  __builtin_ia32_compressstoreuhi512_mask ((__v32hi *) __A, (__v32hi) __C,
+                                                       (__mmask32) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi8 (__m512i __A, __mmask64 __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __C,
+                                                   (__v64qi) __A,
+                                                   (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi8 (__mmask64 __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_expandqi512_maskz ((__v64qi) __B,
+                       (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi8 (__m512i __A, __mmask64 __B, const void * __C)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *) __C,
+                                       (__v64qi) __A, (__mmask64) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi8 (__mmask64 __A, const void * __B)
+{
+  return (__m512i) __builtin_ia32_expandloadqi512_maskz ((const __v64qi *) __B,
+                       (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expand_epi16 (__m512i __A, __mmask32 __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __C,
+                                                   (__v32hi) __A,
+                                                   (__mmask32) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expand_epi16 (__mmask32 __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_expandhi512_maskz ((__v32hi) __B,
+                       (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_expandloadu_epi16 (__m512i __A, __mmask32 __B, const void * __C)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *) __C,
+                                       (__v32hi) __A, (__mmask32) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_expandloadu_epi16 (__mmask32 __A, const void * __B)
+{
+  return (__m512i) __builtin_ia32_expandloadhi512_maskz ((const __v32hi *) __B,
+                       (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D,
+                                                               int __E)
+{
+  return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__C,
+                       (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D)
+{
+  return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__B,
+       (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D,
+                                                               int __E)
+{
+  return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__C,
+                       (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D)
+{
+  return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__B,
+       (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A);
+}
+
+#else
+#define _mm512_mask_shrdi_epi16(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), \
+                                              (__v32hi)(__m512i)(D), \
+                                              (int)(E),                \
+                                              (__v32hi)(__m512i)(A),   \
+                                              (__mmask32)(B)))
+#define _mm512_maskz_shrdi_epi16(A, B, C, D) \
+  ((__m512i) \
+   __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B),            \
+                                    (__v32hi)(__m512i)(C),(int)(D),    \
+                                    (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+                                    (__mmask32)(A)))
+#define _mm512_mask_shldi_epi16(A, B, C, D, E) \
+  ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), \
+                                              (__v32hi)(__m512i)(D), \
+                                              (int)(E), \
+                                              (__v32hi)(__m512i)(A),   \
+                                              (__mmask32)(B)))
+#define _mm512_maskz_shldi_epi16(A, B, C, D) \
+  ((__m512i) \
+   __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B),            \
+                                    (__v32hi)(__m512i)(C),(int)(D),    \
+                                    (__v32hi)(__m512i)_mm512_setzero_si512 (), \
+                                    (__mmask32)(A)))
+#endif
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shrdv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask ((__v32hi)__A,
+                               (__v32hi) __C, (__v32hi) __D, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shrdv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz ((__v32hi)__B,
+                               (__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_shldv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshldv_v32hi_mask ((__v32hi)__A,
+                               (__v32hi) __C, (__v32hi) __D, (__mmask32)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_shldv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz ((__v32hi)__B,
+                               (__v32hi) __C, (__v32hi) __D, (__mmask32)__A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2BW__
+#undef __DISABLE_AVX512VBMI2BW__
+
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2BW__ */
+
+#endif /* __AVX512VBMI2INTRIN_H_INCLUDED */
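For reference, a minimal usage sketch (not part of the diff) showing how the VBMI2 funnel-shift intrinsic defined above can express a per-lane rotate when both operands are the same vector; the rotation count and the -mavx512vbmi2 flag are illustrative assumptions:

#include <immintrin.h>

/* Rotate each 64-bit lane of v left by 13: (v << 13) | (v >> 51).
   _mm512_shldi_epi64 concatenates the two operands per lane and returns
   the upper 64 bits of the left-shifted 128-bit value. */
static inline __m512i rotl64_by13(__m512i v)
{
  return _mm512_shldi_epi64(v, v, 13);
}

As with the prefetch intrinsics, the shift count must be an immediate, hence the macro forms provided when __OPTIMIZE__ is not defined.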
diff --git a/include-gcc/avx512vbmi2vlintrin.h b/include-gcc/avx512vbmi2vlintrin.h
new file mode 100644 (file)
index 0000000..86efca2
--- /dev/null
@@ -0,0 +1,1037 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmi2vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED
+#define _AVX512VBMI2VLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512vl")
+#define __DISABLE_AVX512VBMI2VL__
+#endif /* __AVX512VBMI2VL__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi8 (__m128i __A, __mmask16 __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi)__C,
+                                               (__v16qi)__A, (__mmask16)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi8 (__mmask16 __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __B,
+                       (__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
+}
+
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi16 (void * __A, __mmask16 __B, __m256i __C)
+{
+  __builtin_ia32_compressstoreuhi256_mask ((__v16hi *) __A, (__v16hi) __C,
+                                                       (__mmask16) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi16 (__m128i __A, __mmask8 __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi)__C, (__v8hi)__A,
+                                                               (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi16 (__mmask8 __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __B,
+                               (__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi16 (__m256i __A, __mmask16 __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi)__C,
+                                               (__v16hi)__A, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi16 (__mmask16 __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __B,
+                       (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi8 (void * __A, __mmask16 __B, __m128i __C)
+{
+  __builtin_ia32_compressstoreuqi128_mask ((__v16qi *) __A, (__v16qi) __C,
+                                                       (__mmask16) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi16 (void * __A, __mmask8 __B, __m128i __C)
+{
+  __builtin_ia32_compressstoreuhi128_mask ((__v8hi *) __A, (__v8hi) __C,
+                                                       (__mmask8) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi8 (__m128i __A, __mmask16 __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __C,
+                                                   (__v16qi) __A,
+                                                   (__mmask16) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi8 (__mmask16 __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_expandqi128_maskz ((__v16qi) __B,
+                       (__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi8 (__m128i __A, __mmask16 __B, const void * __C)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *) __C,
+                                       (__v16qi) __A, (__mmask16) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi8 (__mmask16 __A, const void * __B)
+{
+  return (__m128i) __builtin_ia32_expandloadqi128_maskz ((const __v16qi *) __B,
+                       (__v16qi) _mm_setzero_si128 (), (__mmask16) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi16 (__m128i __A, __mmask8 __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __C,
+                                                   (__v8hi) __A,
+                                                   (__mmask8) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi16 (__mmask8 __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_expandhi128_maskz ((__v8hi) __B,
+                               (__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi16 (__m128i __A, __mmask8 __B, const void * __C)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *) __C,
+                                               (__v8hi) __A, (__mmask8) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi16 (__mmask8 __A, const void * __B)
+{
+  return (__m128i) __builtin_ia32_expandloadhi128_maskz ((const __v8hi *) __B,
+                               (__v8hi) _mm_setzero_si128 (), (__mmask8) __A);
+}
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi16 (__m256i __A, __mmask16 __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __C,
+                                                   (__v16hi) __A,
+                                                   (__mmask16) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi16 (__mmask16 __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_expandhi256_maskz ((__v16hi) __B,
+                       (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi16 (__m256i __A, __mmask16 __B, const void * __C)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *) __C,
+                                       (__v16hi) __A, (__mmask16) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi16 (__mmask16 __A, const void * __B)
+{
+  return (__m256i) __builtin_ia32_expandloadhi256_maskz ((const __v16hi *) __B,
+                       (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdi_epi16 (__m256i __A, __m256i __B, int __C)
+{
+  return (__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)__A, (__v16hi) __B,
+                                                                       __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D,
+                                                               int __E)
+{
+  return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__C,
+                       (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
+{
+  return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__B,
+       (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+                                                               int __E)
+{
+  return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__C, (__v8si) __D,
+                                       __E, (__v8si) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+  return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__B, (__v8si) __C,
+                       __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdi_epi32 (__m256i __A, __m256i __B, int __C)
+{
+  return (__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)__A, (__v8si) __B, __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+                                                               int __E)
+{
+  return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__C, (__v4di) __D,
+                                       __E, (__v4di) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+  return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__B, (__v4di) __C,
+                       __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdi_epi64 (__m256i __A, __m256i __B, int __C)
+{
+  return (__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)__A, (__v4di) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                                                               int __E)
+{
+  return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__C, (__v8hi) __D,
+                                       __E, (__v8hi) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+  return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
+                       __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdi_epi16 (__m128i __A, __m128i __B, int __C)
+{
+  return (__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)__A, (__v8hi) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                                                               int __E)
+{
+  return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__C, (__v4si) __D,
+                                       __E, (__v4si) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+  return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__B, (__v4si) __C,
+                       __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdi_epi32 (__m128i __A, __m128i __B, int __C)
+{
+  return (__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)__A, (__v4si) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                                                               int __E)
+{
+  return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__C, (__v2di) __D,
+                                       __E, (__v2di) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+  return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__B, (__v2di) __C,
+                       __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdi_epi64 (__m128i __A, __m128i __B, int __C)
+{
+  return (__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)__A, (__v2di) __B, __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldi_epi16 (__m256i __A, __m256i __B, int __C)
+{
+  return (__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)__A, (__v16hi) __B,
+                                                                       __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D,
+                                                               int __E)
+{
+  return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__C,
+                       (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D)
+{
+  return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__B,
+       (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+                                                               int __E)
+{
+  return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__C, (__v8si) __D,
+                                       __E, (__v8si) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+  return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__B, (__v8si) __C,
+                       __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldi_epi32 (__m256i __A, __m256i __B, int __C)
+{
+  return (__m256i) __builtin_ia32_vpshld_v8si ((__v8si)__A, (__v8si) __B, __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D,
+                                                               int __E)
+{
+  return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__C, (__v4di) __D,
+                                       __E, (__v4di) __A, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D)
+{
+  return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__B, (__v4di) __C,
+                       __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldi_epi64 (__m256i __A, __m256i __B, int __C)
+{
+  return (__m256i) __builtin_ia32_vpshld_v4di ((__v4di)__A, (__v4di) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                                                               int __E)
+{
+  return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__C, (__v8hi) __D,
+                                       __E, (__v8hi) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+  return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__B, (__v8hi) __C,
+                       __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldi_epi16 (__m128i __A, __m128i __B, int __C)
+{
+  return (__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)__A, (__v8hi) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                                                               int __E)
+{
+  return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__C, (__v4si) __D,
+                                       __E, (__v4si) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+  return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__B, (__v4si) __C,
+                       __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldi_epi32 (__m128i __A, __m128i __B, int __C)
+{
+  return (__m128i) __builtin_ia32_vpshld_v4si ((__v4si)__A, (__v4si) __B, __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D,
+                                                               int __E)
+{
+  return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__C, (__v2di) __D,
+                                       __E, (__v2di) __A, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D)
+{
+  return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__B, (__v2di) __C,
+                       __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldi_epi64 (__m128i __A, __m128i __B, int __C)
+{
+  return (__m128i) __builtin_ia32_vpshld_v2di ((__v2di)__A, (__v2di) __B, __C);
+}
+#else
+#define _mm256_shrdi_epi16(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), \
+                                         (__v16hi)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi16(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), \
+                                              (__v16hi)(__m256i)(D), \
+                                              (int)(E),                \
+                                              (__v16hi)(__m256i)(A), \
+                                              (__mmask16)(B)))
+#define _mm256_maskz_shrdi_epi16(A, B, C, D) \
+  ((__m256i) \
+   __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B),            \
+                                    (__v16hi)(__m256i)(C),(int)(D),    \
+                                    (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+                                    (__mmask16)(A)))
+#define _mm256_shrdi_epi32(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), \
+                                        (__v8si)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi32(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), \
+                                             (__v8si)(__m256i)(D), \
+                                             (int)(E), \
+                                             (__v8si)(__m256i)(A), \
+                                             (__mmask8)(B)))
+#define _mm256_maskz_shrdi_epi32(A, B, C, D) \
+  ((__m256i) \
+   __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B),              \
+                                   (__v8si)(__m256i)(C),(int)(D),      \
+                                   (__v8si)(__m256i)_mm256_setzero_si256 (), \
+                                   (__mmask8)(A)))
+#define _mm256_shrdi_epi64(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), \
+                                        (__v4di)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shrdi_epi64(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(C), \
+                                             (__v4di)(__m256i)(D), (int)(E), \
+                                             (__v4di)(__m256i)(A), \
+                                             (__mmask8)(B)))
+#define _mm256_maskz_shrdi_epi64(A, B, C, D) \
+  ((__m256i) \
+   __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B),              \
+                                   (__v4di)(__m256i)(C),(int)(D),      \
+                                   (__v4di)(__m256i)_mm256_setzero_si256 (), \
+                                   (__mmask8)(A)))
+#define _mm_shrdi_epi16(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), \
+                                        (__v8hi)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi16(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), \
+                                             (__v8hi)(__m128i)(D), (int)(E), \
+                                             (__v8hi)(__m128i)(A), \
+                                             (__mmask8)(B)))
+#define _mm_maskz_shrdi_epi16(A, B, C, D) \
+  ((__m128i) \
+   __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B),              \
+                                   (__v8hi)(__m128i)(C),(int)(D),      \
+                                   (__v8hi)(__m128i)_mm_setzero_si128 (), \
+                                   (__mmask8)(A)))
+#define _mm_shrdi_epi32(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), \
+                                        (__v4si)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi32(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C),    \
+                                             (__v4si)(__m128i)(D), (int)(E), \
+                                             (__v4si)(__m128i)(A), \
+                                             (__mmask8)(B)))
+#define _mm_maskz_shrdi_epi32(A, B, C, D) \
+  ((__m128i) \
+   __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B),              \
+                                   (__v4si)(__m128i)(C),(int)(D),      \
+                                   (__v4si)(__m128i)_mm_setzero_si128 (), \
+                                   (__mmask8)(A)))
+#define _mm_shrdi_epi64(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), \
+                                        (__v2di)(__m128i)(B),(int)(C)))
+#define _mm_mask_shrdi_epi64(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), \
+                                             (__v2di)(__m128i)(D), (int)(E), \
+                                             (__v2di)(__m128i)(A), \
+                                             (__mmask8)(B)))
+#define _mm_maskz_shrdi_epi64(A, B, C, D) \
+  ((__m128i) \
+   __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B),              \
+                                   (__v2di)(__m128i)(C),(int)(D),      \
+                                   (__v2di)(__m128i)_mm_setzero_si128 (), \
+                                   (__mmask8)(A)))
+#define _mm256_shldi_epi16(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), \
+                                         (__v16hi)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi16(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), \
+                                              (__v16hi)(__m256i)(D), \
+                                              (int)(E),                \
+                                              (__v16hi)(__m256i)(A), \
+                                              (__mmask16)(B)))
+#define _mm256_maskz_shldi_epi16(A, B, C, D) \
+  ((__m256i) \
+   __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B),            \
+                                    (__v16hi)(__m256i)(C),(int)(D),    \
+                                    (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+                                    (__mmask16)(A)))
+#define _mm256_shldi_epi32(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), \
+                                        (__v8si)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi32(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), \
+                                             (__v8si)(__m256i)(D), (int)(E), \
+                                             (__v8si)(__m256i)(A), \
+                                             (__mmask8)(B)))
+#define _mm256_maskz_shldi_epi32(A, B, C, D) \
+  ((__m256i) \
+   __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B),              \
+                                   (__v8si)(__m256i)(C),(int)(D),      \
+                                   (__v8si)(__m256i)_mm256_setzero_si256 (), \
+                                   (__mmask8)(A)))
+#define _mm256_shldi_epi64(A, B, C) \
+  ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), \
+                                        (__v4di)(__m256i)(B),(int)(C)))
+#define _mm256_mask_shldi_epi64(A, B, C, D, E) \
+  ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), \
+                                             (__v4di)(__m256i)(D), (int)(E), \
+                                             (__v4di)(__m256i)(A), \
+                                             (__mmask8)(B)))
+#define _mm256_maskz_shldi_epi64(A, B, C, D) \
+  ((__m256i) \
+   __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(B),              \
+                                   (__v4di)(__m256i)(C),(int)(D),      \
+                                   (__v4di)(__m256i)_mm256_setzero_si256 (), \
+                                   (__mmask8)(A)))
+#define _mm_shldi_epi16(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), \
+                                        (__v8hi)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi16(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), \
+                                             (__v8hi)(__m128i)(D), (int)(E), \
+                                             (__v8hi)(__m128i)(A), \
+                                             (__mmask8)(B)))
+#define _mm_maskz_shldi_epi16(A, B, C, D) \
+  ((__m128i) \
+   __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B),              \
+                                   (__v8hi)(__m128i)(C),(int)(D),      \
+                                   (__v8hi)(__m128i)_mm_setzero_si128 (), \
+                                   (__mmask8)(A)))
+#define _mm_shldi_epi32(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), \
+                                        (__v4si)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi32(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \
+                                             (__v4si)(__m128i)(D), (int)(E), \
+                                             (__v4si)(__m128i)(A), \
+                                             (__mmask8)(B)))
+#define _mm_maskz_shldi_epi32(A, B, C, D) \
+  ((__m128i) \
+   __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B),              \
+                                   (__v4si)(__m128i)(C),(int)(D),      \
+                                   (__v4si)(__m128i)_mm_setzero_si128 (), \
+                                   (__mmask8)(A)))
+#define _mm_shldi_epi64(A, B, C) \
+  ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), \
+                                        (__v2di)(__m128i)(B),(int)(C)))
+#define _mm_mask_shldi_epi64(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), \
+                                             (__v2di)(__m128i)(D), (int)(E), \
+                                             (__v2di)(__m128i)(A), \
+                                             (__mmask8)(B)))
+#define _mm_maskz_shldi_epi64(A, B, C, D) \
+  ((__m128i) \
+   __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B),              \
+                                   (__v2di)(__m128i)(C),(int)(D),      \
+                                   (__v2di)(__m128i)_mm_setzero_si128 (), \
+                                   (__mmask8)(A)))
+#endif
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdv_epi16 (__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpshrdv_v16hi ((__v16hi)__A, (__v16hi) __B,
+                                                               (__v16hi) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask ((__v16hi)__A,
+                               (__v16hi) __C, (__v16hi) __D, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz ((__v16hi)__B,
+                               (__v16hi) __C, (__v16hi) __D, (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdv_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpshrdv_v8si ((__v8si)__A, (__v8si) __B,
+                                                               (__v8si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshrdv_v8si_mask ((__v8si)__A, (__v8si) __C,
+                                               (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz ((__v8si)__B, (__v8si) __C,
+                                                (__v8si) __D, (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shrdv_epi64 (__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpshrdv_v4di ((__v4di)__A, (__v4di) __B,
+                                                               (__v4di) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shrdv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshrdv_v4di_mask ((__v4di)__A, (__v4di) __C,
+                                               (__v4di) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shrdv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz ((__v4di)__B, (__v4di) __C,
+                                                (__v4di) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdv_epi16 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpshrdv_v8hi ((__v8hi)__A, (__v8hi) __B,
+                                                               (__v8hi) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask ((__v8hi)__A, (__v8hi) __C,
+                                               (__v8hi) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C,
+                                                (__v8hi) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdv_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpshrdv_v4si ((__v4si)__A, (__v4si) __B,
+                                                               (__v4si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshrdv_v4si_mask ((__v4si)__A, (__v4si) __C,
+                                               (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz ((__v4si)__B, (__v4si) __C,
+                                                (__v4si) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shrdv_epi64 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpshrdv_v2di ((__v2di)__A, (__v2di) __B,
+                                                               (__v2di) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shrdv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshrdv_v2di_mask ((__v2di)__A, (__v2di) __C,
+                                               (__v2di) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shrdv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz ((__v2di)__B, (__v2di) __C,
+                                                (__v2di) __D, (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldv_epi16 (__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpshldv_v16hi ((__v16hi)__A, (__v16hi) __B,
+                                                               (__v16hi) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshldv_v16hi_mask ((__v16hi)__A,
+                               (__v16hi) __C, (__v16hi) __D, (__mmask16)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz ((__v16hi)__B,
+                               (__v16hi) __C, (__v16hi) __D, (__mmask16)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldv_epi32 (__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpshldv_v8si ((__v8si)__A, (__v8si) __B,
+                                                               (__v8si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshldv_v8si_mask ((__v8si)__A, (__v8si) __C,
+                                               (__v8si) __D, (__mmask8)__B) ;
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshldv_v8si_maskz ((__v8si)__B, (__v8si) __C,
+                                               (__v8si) __D, (__mmask8)__A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shldv_epi64 (__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpshldv_v4di ((__v4di)__A, (__v4di) __B,
+                                                               (__v4di) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shldv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshldv_v4di_mask ((__v4di)__A, (__v4di) __C,
+                                               (__v4di) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shldv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpshldv_v4di_maskz ((__v4di)__B, (__v4di) __C,
+                                                (__v4di) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldv_epi16 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpshldv_v8hi ((__v8hi)__A, (__v8hi) __B,
+                                                               (__v8hi) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshldv_v8hi_mask ((__v8hi)__A, (__v8hi) __C,
+                                               (__v8hi) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C,
+                                                (__v8hi) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldv_epi32 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpshldv_v4si ((__v4si)__A, (__v4si) __B,
+                                                               (__v4si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshldv_v4si_mask ((__v4si)__A, (__v4si) __C,
+                                               (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshldv_v4si_maskz ((__v4si)__B, (__v4si) __C,
+                                                (__v4si) __D, (__mmask8)__A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shldv_epi64 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpshldv_v2di ((__v2di)__A, (__v2di) __B,
+                                                               (__v2di) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shldv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshldv_v2di_mask ((__v2di)__A, (__v2di) __C,
+                                               (__v2di) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shldv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpshldv_v2di_maskz ((__v2di)__B, (__v2di) __C,
+                                               (__v2di) __D, (__mmask8)__A);
+}
+
+
+
+
+#ifdef __DISABLE_AVX512VBMI2VL__
+#undef __DISABLE_AVX512VBMI2VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2VL__ */
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \
+    !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi2,avx512vl,avx512bw")
+#define __DISABLE_AVX512VBMI2VLBW__
+#endif /* __AVX512VBMI2VLBW__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi)__C,
+                                               (__v32qi)__A, (__mmask32)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi8 (__mmask32 __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __B,
+                       (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi8 (void * __A, __mmask32 __B, __m256i __C)
+{
+  __builtin_ia32_compressstoreuqi256_mask ((__v32qi *) __A, (__v32qi) __C,
+                                                       (__mmask32) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi8 (__m256i __A, __mmask32 __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __C,
+                                                   (__v32qi) __A,
+                                                   (__mmask32) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi8 (__mmask32 __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_expandqi256_maskz ((__v32qi) __B,
+                       (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi8 (__m256i __A, __mmask32 __B, const void * __C)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *) __C,
+                                       (__v32qi) __A, (__mmask32) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi8 (__mmask32 __A, const void * __B)
+{
+  return (__m256i) __builtin_ia32_expandloadqi256_maskz ((const __v32qi *) __B,
+                       (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A);
+}
+
+#ifdef __DISABLE_AVX512VBMI2VLBW__
+#undef __DISABLE_AVX512VBMI2VLBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI2VLBW__ */
+
+#endif /* _AVX512VBMI2VLINTRIN_H_INCLUDED */
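
[Editor's note: hypothetical usage sketch, not part of the commit diff. The 128/256-bit double-shift and byte-compress intrinsics declared in avx512vbmi2vlintrin.h above could be exercised roughly as follows; variable names and the expected output are illustrative only. A build would need the matching ISA flags, e.g. `gcc -O2 -mavx512vl -mavx512vbmi2 -mavx512bw`.]

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  /* Concatenated left shift: each 32-bit lane becomes (a << 8) | (b >> 24). */
  __m128i a  = _mm_set1_epi32(0x11223344);
  __m128i b  = _mm_set1_epi32(0x55667788);
  __m128i hi = _mm_shldi_epi32(a, b, 8);

  /* Zero-masking byte compress: the bytes selected by the mask are packed
     to the front of the destination, the remaining bytes are zeroed.  */
  __m256i bytes  = _mm256_set1_epi8(7);
  __m256i packed = _mm256_maskz_compress_epi8((__mmask32) 0x0000FFFFu, bytes);

  unsigned out[4];
  _mm_storeu_si128((__m128i *) out, hi);
  printf("%08x\n", out[0]);   /* expected: 22334455 */
  (void) packed;
  return 0;
}
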
diff --git a/include-gcc/avx512vbmiintrin.h b/include-gcc/avx512vbmiintrin.h
new file mode 100644 (file)
index 0000000..5025860
--- /dev/null
@@ -0,0 +1,158 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMIINTRIN_H_INCLUDED
+#define _AVX512VBMIINTRIN_H_INCLUDED
+
+#ifndef __AVX512VBMI__
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi")
+#define __DISABLE_AVX512VBMI__
+#endif /* __AVX512VBMI__ */
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                                                         (__v64qi) __Y,
+                                                         (__v64qi) __W,
+                                                         (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                                                         (__v64qi) __Y,
+                                                         (__v64qi)
+                                                         _mm512_setzero_si512 (),
+                                                         (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y)
+{
+  return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X,
+                                                         (__v64qi) __Y,
+                                                         (__v64qi)
+                                                         _mm512_undefined_epi32 (),
+                                                         (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutexvar_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                                                    (__v64qi) __A,
+                                                    (__v64qi)
+                                                    _mm512_undefined_epi32 (),
+                                                    (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A,
+                               __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                                                    (__v64qi) __A,
+                                                    (__v64qi)
+                                                    _mm512_setzero_si512(),
+                                                    (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A,
+                              __m512i __B)
+{
+  return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B,
+                                                    (__v64qi) __A,
+                                                    (__v64qi) __W,
+                                                    (__mmask64) __M);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+                                                       /* idx */ ,
+                                                       (__v64qi) __A,
+                                                       (__v64qi) __B,
+                                                       (__mmask64) -1);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U,
+                               __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I
+                                                       /* idx */ ,
+                                                       (__v64qi) __A,
+                                                       (__v64qi) __B,
+                                                       (__mmask64)
+                                                       __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I,
+                                __mmask64 __U, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A,
+                                                       (__v64qi) __I
+                                                       /* idx */ ,
+                                                       (__v64qi) __B,
+                                                       (__mmask64)
+                                                       __U);
+}
+
+extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A,
+                                __m512i __I, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I
+                                                        /* idx */ ,
+                                                        (__v64qi) __A,
+                                                        (__v64qi) __B,
+                                                        (__mmask64)
+                                                        __U);
+}
+
+#ifdef __DISABLE_AVX512VBMI__
+#undef __DISABLE_AVX512VBMI__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMI__ */
+
+#endif /* _AVX512VBMIINTRIN_H_INCLUDED */
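
[Editor's note: hypothetical usage sketch, not part of the commit diff. `_mm512_permutexvar_epi8` from avx512vbmiintrin.h above is a full cross-lane byte shuffle; the helper below (illustrative name, not from the source) reverses the 64 bytes of a ZMM register. Build with e.g. `gcc -O2 -mavx512f -mavx512vbmi`.]

#include <immintrin.h>

/* Reverse the 64 bytes of a 512-bit vector using the VBMI byte permute. */
static inline __m512i reverse_bytes_512 (__m512i v)
{
  char idx[64];
  for (int i = 0; i < 64; ++i)
    idx[i] = (char) (63 - i);               /* result byte i comes from source byte 63 - i */
  __m512i perm = _mm512_loadu_si512 (idx);  /* the first argument of the permute is the index vector */
  return _mm512_permutexvar_epi8 (perm, v);
}
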
diff --git a/include-gcc/avx512vbmivlintrin.h b/include-gcc/avx512vbmivlintrin.h
new file mode 100644 (file)
index 0000000..035408f
--- /dev/null
@@ -0,0 +1,273 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VBMIVLINTRIN_H_INCLUDED
+#define _AVX512VBMIVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VBMI__)
+#pragma GCC push_options
+#pragma GCC target("avx512vbmi,avx512vl")
+#define __DISABLE_AVX512VBMIVL__
+#endif /* __AVX512VBMIVL__ */
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                                                         (__v32qi) __Y,
+                                                         (__v32qi) __W,
+                                                         (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                                                         (__v32qi) __Y,
+                                                         (__v32qi)
+                                                         _mm256_setzero_si256 (),
+                                                         (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X,
+                                                         (__v32qi) __Y,
+                                                         (__v32qi)
+                                                         _mm256_undefined_si256 (),
+                                                         (__mmask32) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                                                         (__v16qi) __Y,
+                                                         (__v16qi) __W,
+                                                         (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                                                         (__v16qi) __Y,
+                                                         (__v16qi)
+                                                         _mm_setzero_si128 (),
+                                                         (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X,
+                                                         (__v16qi) __Y,
+                                                         (__v16qi)
+                                                         _mm_undefined_si128 (),
+                                                         (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                                                    (__v32qi) __A,
+                                                    (__v32qi)
+                                                    _mm256_undefined_si256 (),
+                                                    (__mmask32) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A,
+                               __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                                                    (__v32qi) __A,
+                                                    (__v32qi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+                              __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B,
+                                                    (__v32qi) __A,
+                                                    (__v32qi) __W,
+                                                    (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                                                    (__v16qi) __A,
+                                                    (__v16qi)
+                                                    _mm_undefined_si128 (),
+                                                    (__mmask16) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                                                    (__v16qi) __A,
+                                                    (__v16qi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+                           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B,
+                                                    (__v16qi) __A,
+                                                    (__v16qi) __W,
+                                                    (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+                                                       /* idx */ ,
+                                                       (__v32qi) __A,
+                                                       (__v32qi) __B,
+                                                       (__mmask32) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U,
+                               __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I
+                                                       /* idx */ ,
+                                                       (__v32qi) __A,
+                                                       (__v32qi) __B,
+                                                       (__mmask32)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I,
+                                __mmask32 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A,
+                                                       (__v32qi) __I
+                                                       /* idx */ ,
+                                                       (__v32qi) __B,
+                                                       (__mmask32)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A,
+                                __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I
+                                                        /* idx */ ,
+                                                        (__v32qi) __A,
+                                                        (__v32qi) __B,
+                                                        (__mmask32)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+                                                       /* idx */ ,
+                                                       (__v16qi) __A,
+                                                       (__v16qi) __B,
+                                                       (__mmask16) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I,
+                            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I
+                                                       /* idx */ ,
+                                                       (__v16qi) __A,
+                                                       (__v16qi) __B,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A,
+                                                       (__v16qi) __I
+                                                       /* idx */ ,
+                                                       (__v16qi) __B,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I
+                                                        /* idx */ ,
+                                                        (__v16qi) __A,
+                                                        (__v16qi) __B,
+                                                        (__mmask16)
+                                                        __U);
+}
+
+#ifdef __DISABLE_AVX512VBMIVL__
+#undef __DISABLE_AVX512VBMIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VBMIVL__ */
+
+#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */
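
[Editor's note: hypothetical usage sketch, not part of the commit diff. The VL variants above expose the same byte permute at 256/128-bit width; the rotate helper below is illustrative only. Build with e.g. `gcc -O2 -mavx512vl -mavx512vbmi`.]

#include <immintrin.h>

/* Rotate the 32 bytes of a YMM register by n positions:
   byte i of the result is byte (i + n) % 32 of the input.  */
static inline __m256i rot_bytes_256 (__m256i v, int n)
{
  char idx[32];
  for (int i = 0; i < 32; ++i)
    idx[i] = (char) ((i + n) & 31);
  __m256i perm = _mm256_loadu_si256 ((const __m256i *) idx);
  return _mm256_permutexvar_epi8 (perm, v);
}
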
diff --git a/include-gcc/avx512vlbwintrin.h b/include-gcc/avx512vlbwintrin.h
new file mode 100644 (file)
index 0000000..0232783
--- /dev/null
@@ -0,0 +1,4758 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLBWINTRIN_H_INCLUDED
+#define _AVX512VLBWINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512bw")
+#define __DISABLE_AVX512VLBW__
+#endif /* __AVX512VLBW__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef short __v16hi_u __attribute__ ((__vector_size__ (32),  \
+                                       __may_alias__, __aligned__ (1)));
+typedef short __v8hi_u __attribute__ ((__vector_size__ (16),   \
+                                      __may_alias__, __aligned__ (1)));
+typedef char __v32qi_u __attribute__ ((__vector_size__ (32),   \
+                                      __may_alias__, __aligned__ (1)));
+typedef char __v16qi_u __attribute__ ((__vector_size__ (16),   \
+                                      __may_alias__, __aligned__ (1)));
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A,
+                                                   (__v32qi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A,
+                                                   (__v32qi)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A,
+                                                   (__v16qi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi8 (void *__P, __m256i __A)
+{
+  *(__v32qi_u *) __P = (__v32qi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A)
+{
+  __builtin_ia32_storedquqi256_mask ((char *) __P,
+                                    (__v32qi) __A,
+                                    (__mmask32) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi8 (void *__P, __m128i __A)
+{
+  *(__v16qi_u *) __P = (__v16qi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A)
+{
+  __builtin_ia32_storedquqi128_mask ((char *) __P,
+                                    (__v16qi) __A,
+                                    (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi16 (void const *__P)
+{
+  return (__m256i) (*(__v16hi_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P,
+                                                    (__v16hi) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi16 (void const *__P)
+{
+  return (__m128i) (*(__v8hi_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P,
+                                                    (__v8hi) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A,
+                                                   (__v16hi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A,
+                                                   (__v8hi) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi8 (void const *__P)
+{
+  return (__m256i) (*(__v32qi_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P,
+                                                    (__v32qi) __W,
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P,
+                                                    (__v32qi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi8 (void const *__P)
+{
+  return (__m128i) (*(__v16qi_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P,
+                                                    (__v16qi) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P,
+                                                    (__v16qi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi16_epi8 (__m256i __A)
+{
+
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+                                                 (__v16qi)_mm_undefined_si128(),
+                                                 (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A)
+{
+  __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi16_epi8 (__m128i __A)
+{
+
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+                                                  (__v16qi)_mm_undefined_si128(),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A)
+{
+  __builtin_ia32_pmovswb128mem_mask ((unsigned long long *) __P , (__v8hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi16_epi8 (__m256i __A)
+{
+
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+                                                  (__v16qi)_mm_undefined_si128(),
+                                                  (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A)
+{
+  __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi16_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                                                   (__v16qi)_mm_undefined_si128(),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovuswb128mem_mask ((unsigned long long *) __P, (__v8hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi16_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                                                   (__v16qi)_mm_undefined_si128(),
+                                                   (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A)
+{
+  __builtin_ia32_pmovuswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
+                                                      (__v32qi) __O,
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A,
+                                                      (__v32qi)
+                                                      _mm256_setzero_si256 (),
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                                                          (__v32qi) __O,
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi8 (__mmask32 __M, char __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A,
+                                                          (__v32qi)
+                                                          _mm256_setzero_si256 (),
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
+                                                      (__v16qi) __O,
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A,
+                                                      (__v16qi)
+                                                      _mm_setzero_si128 (),
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                                                          (__v16qi) __O,
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi8 (__mmask16 __M, char __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A,
+                                                          (__v16qi)
+                                                          _mm_setzero_si128 (),
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
+                                                      (__v16hi) __O,
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A,
+                                                      (__v16hi)
+                                                      _mm256_setzero_si256 (),
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                                                          (__v16hi) __O,
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi16 (__mmask16 __M, short __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A,
+                                                          (__v16hi)
+                                                          _mm256_setzero_si256 (),
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
+                                                      (__v8hi) __O,
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A,
+                                                      (__v8hi)
+                                                      _mm_setzero_si128 (),
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                                                          (__v8hi) __O,
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi16 (__mmask8 __M, short __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A,
+                                                          (__v8hi)
+                                                          _mm_setzero_si128 (),
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                                                    (__v16hi) __A,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A,
+                               __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                                                    (__v16hi) __A,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+                              __m256i __B)
+{
+  return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B,
+                                                    (__v16hi) __A,
+                                                    (__v16hi) __W,
+                                                    (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutexvar_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                                                    (__v8hi) __A,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                                                    (__v8hi) __A,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+                           __m128i __B)
+{
+  return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B,
+                                                    (__v8hi) __A,
+                                                    (__v8hi) __W,
+                                                    (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I
+                                                       /* idx */ ,
+                                                       (__v16hi) __A,
+                                                       (__v16hi) __B,
+                                                       (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U,
+                               __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I
+                                                       /* idx */ ,
+                                                       (__v16hi) __A,
+                                                       (__v16hi) __B,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I,
+                                __mmask16 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A,
+                                                       (__v16hi) __I
+                                                       /* idx */ ,
+                                                       (__v16hi) __B,
+                                                       (__mmask16)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A,
+                                __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I
+                                                        /* idx */ ,
+                                                        (__v16hi) __A,
+                                                        (__v16hi) __B,
+                                                        (__mmask16)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I
+                                                       /* idx */ ,
+                                                       (__v8hi) __A,
+                                                       (__v8hi) __B,
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I,
+                            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I
+                                                       /* idx */ ,
+                                                       (__v8hi) __A,
+                                                       (__v8hi) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A,
+                                                       (__v8hi) __I
+                                                       /* idx */ ,
+                                                       (__v8hi) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I
+                                                        /* idx */ ,
+                                                        (__v8hi) __A,
+                                                        (__v8hi) __B,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
+                          __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+                                                    (__v32qi) __Y,
+                                                    (__v16hi) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X,
+                                                    (__v32qi) __Y,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X,
+                       __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+                                                    (__v16qi) __Y,
+                                                    (__v8hi) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X,
+                                                    (__v16qi) __Y,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v8si) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v8si)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v4si) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v4si)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi8_mask (__m128i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi8_mask (__m256i __A)
+{
+  return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi16_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi16_mask (__m256i __A)
+{
+  return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi8 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2b128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi8 (__mmask32 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2b256 (__A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi16 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2w128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi16 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2w256 (__A);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+                                               (__v16qi) __B,
+                                               (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A,
+                                               (__v16qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+                                               (__v32qi) __B,
+                                               (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A,
+                                               (__v32qi) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                                              (__v8hi) __B,
+                                              (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A,
+                                              (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+                                               (__v16hi) __B,
+                                               (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A,
+                                               (__v16hi) __B, __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+                  __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+                  __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A,
+                  __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi) __W,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A,
+                  __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __M);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                        __m256i __B, const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A,
+                                                  (__v4di) __B,
+                                                  __N * 8,
+                                                  (__v4di) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi8 (__mmask32 __U, __m256i __A, __m256i __B,
+                         const int __N)
+{
+  return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A,
+                                                  (__v4di) __B,
+                                                  __N * 8,
+                                                  (__v4di)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                     __m128i __B, const int __N)
+{
+  return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A,
+                                                  (__v2di) __B,
+                                                  __N * 8,
+                                                  (__v2di) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi8 (__mmask16 __U, __m128i __A, __m128i __B,
+                      const int __N)
+{
+  return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A,
+                                                  (__v2di) __B,
+                                                  __N * 8,
+                                                  (__v2di)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dbsad_epu8 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
+                                                   (__v32qi) __B,
+                                                   __imm,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dbsad_epu8 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
+                                                   (__v32qi) __B,
+                                                   __imm,
+                                                   (__v16hi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dbsad_epu8 (__mmask16 __U, __m256i __A, __m256i __B,
+                        const int __imm)
+{
+  return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A,
+                                                   (__v32qi) __B,
+                                                   __imm,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dbsad_epu8 (__m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
+                                                   (__v16qi) __B,
+                                                   __imm,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dbsad_epu8 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
+                                                   (__v16qi) __B,
+                                                   __imm,
+                                                   (__v8hi) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B,
+                     const int __imm)
+{
+  return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A,
+                                                   (__v16qi) __B,
+                                                   __imm,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A,
+                                                   (__v8hi) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A,
+                                                   (__v16qi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A,
+                                                   (__v16hi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A,
+                                                   (__v32qi) __W,
+                                                   (__mmask32) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi16_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, __P,
+                                                (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi16_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, __P,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi16_mask (__mmask16 __U, __m256i __X, __m256i __Y,
+                           const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, __P,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi16_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, __P,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi8_mask (__mmask16 __U, __m128i __X, __m128i __Y,
+                       const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, __P,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi8_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, __P,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi8_mask (__mmask32 __U, __m256i __X, __m256i __Y,
+                          const int __P)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, __P,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi8_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, __P,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu16_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu16_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu16_mask (__mmask16 __U, __m256i __X, __m256i __Y,
+                           const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, __P,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu16_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, __P,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu8_mask (__mmask16 __U, __m128i __X, __m128i __Y,
+                       const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, __P,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu8_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, __P,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu8_mask (__mmask32 __U, __m256i __X, __m256i __Y,
+                          const int __P)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, __P,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu8_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, __P,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srli_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srli_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shufflehi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                            const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A,
+                                                  __imm,
+                                                  (__v16hi) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shufflehi_epi16 (__mmask16 __U, __m256i __A,
+                             const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A,
+                                                  __imm,
+                                                  (__v16hi)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shufflehi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                         const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm,
+                                                  (__v8hi) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shufflehi_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shufflelo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                            const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A,
+                                                  __imm,
+                                                  (__v16hi) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shufflelo_epi16 (__mmask16 __U, __m256i __A,
+                             const int __imm)
+{
+  return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A,
+                                                  __imm,
+                                                  (__v16hi)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shufflelo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                         const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm,
+                                                  (__v8hi) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shufflelo_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
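Each immediate-shift intrinsic above comes in a merge-masked (_mm*_mask_*) and a zero-masked (_mm*_maskz_*) flavour. A brief sketch of the difference, under the same assumptions as the earlier example (the name shift_demo is illustrative only):

/* assumes #include <immintrin.h>, compiled with -mavx512bw -mavx512vl */
static inline void
shift_demo (__m256i a, __m256i src, __m256i *merged, __m256i *zeroed)
{
  __mmask16 k = 0x00FF;                              /* low 8 of 16 words   */
  *merged = _mm256_mask_srli_epi16 (src, k, a, 4);   /* other lanes <- src  */
  *zeroed = _mm256_maskz_srli_epi16 (k, a, 4);       /* other lanes <- 0    */
}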
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       int __B)
+{
+  return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+#else
+#define _mm256_mask_alignr_epi8(W, U, X, Y, N)                                     \
+  ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X),                 \
+                                           (__v4di)(__m256i)(Y), (int)((N) * 8),   \
+                                           (__v4di)(__m256i)(X), (__mmask32)(U)))
+
+#define _mm256_mask_srli_epi16(W, U, A, B)                              \
+  ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A),      \
+    (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+
+#define _mm256_maskz_srli_epi16(U, A, B)                                \
+  ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A),      \
+    (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
+
+#define _mm_mask_srli_epi16(W, U, A, B)                                 \
+  ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A),       \
+    (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srli_epi16(U, A, B)                                   \
+  ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A),       \
+    (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+
+#define _mm256_mask_srai_epi16(W, U, A, B)                              \
+  ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A),      \
+    (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U)))
+
+#define _mm256_maskz_srai_epi16(U, A, B)                                \
+  ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A),      \
+    (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U)))
+
+#define _mm_mask_srai_epi16(W, U, A, B)                                 \
+  ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A),       \
+    (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srai_epi16(U, A, B)                                   \
+  ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A),       \
+    (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U)))
+
+#define _mm256_mask_shufflehi_epi16(W, U, A, B)                                     \
+  ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
+                                             (__v16hi)(__m256i)(W),                 \
+                                             (__mmask16)(U)))
+
+#define _mm256_maskz_shufflehi_epi16(U, A, B)                                       \
+  ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
+                                             (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+                                             (__mmask16)(U)))
+
+#define _mm_mask_shufflehi_epi16(W, U, A, B)                                        \
+  ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
+                                             (__v8hi)(__m128i)(W),                  \
+                                             (__mmask8)(U)))
+
+#define _mm_maskz_shufflehi_epi16(U, A, B)                                          \
+  ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
+                                            (__v8hi)(__m128i)_mm_setzero_si128 (), \
+                                             (__mmask8)(U)))
+
+#define _mm256_mask_shufflelo_epi16(W, U, A, B)                                     \
+  ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
+                                             (__v16hi)(__m256i)(W),                 \
+                                             (__mmask16)(U)))
+
+#define _mm256_maskz_shufflelo_epi16(U, A, B)                                       \
+  ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B),       \
+                                             (__v16hi)(__m256i)_mm256_setzero_si256 (), \
+                                             (__mmask16)(U)))
+
+#define _mm_mask_shufflelo_epi16(W, U, A, B)                                        \
+  ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
+                                             (__v8hi)(__m128i)(W),                  \
+                                             (__mmask8)(U)))
+
+#define _mm_maskz_shufflelo_epi16(U, A, B)                                          \
+  ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B),        \
+                                            (__v8hi)(__m128i)_mm_setzero_si128 (), \
+                                             (__mmask8)(U)))
+
+#define _mm256_maskz_alignr_epi8(U, X, Y, N)                                       \
+  ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X),                 \
+                                           (__v4di)(__m256i)(Y), (int)((N) * 8),   \
+                                           (__v4di)(__m256i)_mm256_setzero_si256 (),   \
+                                           (__mmask32)(U)))
+
+#define _mm_mask_alignr_epi8(W, U, X, Y, N)                                        \
+  ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X),                 \
+                                           (__v2di)(__m128i)(Y), (int)((N) * 8),   \
+                                           (__v2di)(__m128i)(X), (__mmask16)(U)))
+
+#define _mm_maskz_alignr_epi8(U, X, Y, N)                                          \
+  ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X),                 \
+                                           (__v2di)(__m128i)(Y), (int)((N) * 8),   \
+                                           (__v2di)(__m128i)_mm_setzero_si128 (),  \
+                                           (__mmask16)(U)))
+
+#define _mm_mask_slli_epi16(W, U, X, C)                                          \
+  ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
+    (__v8hi)(__m128i)(W),\
+    (__mmask8)(U)))
+
+#define _mm_maskz_slli_epi16(U, X, C)                                    \
+  ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\
+    (__v8hi)(__m128i)_mm_setzero_si128 (),\
+    (__mmask8)(U)))
+
+#define _mm256_dbsad_epu8(X, Y, C)                                                  \
+  ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X),               \
+                                              (__v32qi)(__m256i) (Y), (int) (C),    \
+                                              (__v16hi)(__m256i)_mm256_setzero_si256(),\
+                                              (__mmask16)-1))
+
+#define _mm256_mask_slli_epi16(W, U, X, C)                                 \
+  ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
+    (__v16hi)(__m256i)(W),\
+    (__mmask16)(U)))
+
+#define _mm256_maskz_slli_epi16(U, X, C)                                   \
+  ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\
+    (__v16hi)(__m256i)_mm256_setzero_si256 (),\
+    (__mmask16)(U)))
+
+#define _mm256_mask_dbsad_epu8(W, U, X, Y, C)                                       \
+  ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X),               \
+                                              (__v32qi)(__m256i) (Y), (int) (C),    \
+                                              (__v16hi)(__m256i)(W),                \
+                                              (__mmask16)(U)))
+
+#define _mm256_maskz_dbsad_epu8(U, X, Y, C)                                         \
+  ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X),               \
+                                              (__v32qi)(__m256i) (Y), (int) (C),    \
+                                              (__v16hi)(__m256i)_mm256_setzero_si256(),\
+                                              (__mmask16)(U)))
+
+#define _mm_dbsad_epu8(X, Y, C)                                                     \
+  ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X),               \
+                                              (__v16qi)(__m128i) (Y), (int) (C),    \
+                                              (__v8hi)(__m128i)_mm_setzero_si128(), \
+                                              (__mmask8)-1))
+
+#define _mm_mask_dbsad_epu8(W, U, X, Y, C)                                          \
+  ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X),               \
+                                              (__v16qi)(__m128i) (Y), (int) (C),    \
+                                              (__v8hi)(__m128i)(W),                 \
+                                              (__mmask8)(U)))
+
+#define _mm_maskz_dbsad_epu8(U, X, Y, C)                                            \
+  ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X),               \
+                                              (__v16qi)(__m128i) (Y), (int) (C),    \
+                                              (__v8hi)(__m128i)_mm_setzero_si128(), \
+                                              (__mmask8)(U)))
+
+#define _mm_mask_blend_epi16(__U, __A, __W)                          \
+  ((__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) (__A),        \
+                                                   (__v8hi) (__W),   \
+                                                   (__mmask8) (__U)))
+
+#define _mm_mask_blend_epi8(__U, __A, __W)                           \
+  ((__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) (__A),       \
+                                                   (__v16qi) (__W),  \
+                                                   (__mmask16) (__U)))
+
+#define _mm256_mask_blend_epi16(__U, __A, __W)                       \
+  ((__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) (__A),       \
+                                                   (__v16hi) (__W),  \
+                                                   (__mmask16) (__U)))
+
+#define _mm256_mask_blend_epi8(__U, __A, __W)                        \
+  ((__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) (__A),       \
+                                                   (__v32qi) (__W),  \
+                                                   (__mmask32) (__U)))
+
+#define _mm_cmp_epi16_mask(X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X),       \
+                                           (__v8hi)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)(-1)))
+
+#define _mm_cmp_epi8_mask(X, Y, P)                             \
+  ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X),     \
+                                           (__v16qi)(__m128i)(Y), (int)(P),\
+                                           (__mmask16)(-1)))
+
+#define _mm256_cmp_epi16_mask(X, Y, P)                         \
+  ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X),     \
+                                           (__v16hi)(__m256i)(Y), (int)(P),\
+                                           (__mmask16)(-1)))
+
+#define _mm256_cmp_epi8_mask(X, Y, P)                          \
+  ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X),     \
+                                           (__v32qi)(__m256i)(Y), (int)(P),\
+                                           (__mmask32)(-1)))
+
+#define _mm_cmp_epu16_mask(X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X),      \
+                                           (__v8hi)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)(-1)))
+
+#define _mm_cmp_epu8_mask(X, Y, P)                             \
+  ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X),    \
+                                           (__v16qi)(__m128i)(Y), (int)(P),\
+                                           (__mmask16)(-1)))
+
+#define _mm256_cmp_epu16_mask(X, Y, P)                         \
+  ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X),    \
+                                           (__v16hi)(__m256i)(Y), (int)(P),\
+                                           (__mmask16)(-1)))
+
+#define _mm256_cmp_epu8_mask(X, Y, P)                          \
+  ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X),    \
+                                           (__v32qi)(__m256i)(Y), (int)(P),\
+                                           (__mmask32)-1))
+
+#define _mm_mask_cmp_epi16_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X),       \
+                                           (__v8hi)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm_mask_cmp_epi8_mask(M, X, Y, P)                             \
+  ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X),     \
+                                           (__v16qi)(__m128i)(Y), (int)(P),\
+                                           (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epi16_mask(M, X, Y, P)                         \
+  ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X),     \
+                                           (__v16hi)(__m256i)(Y), (int)(P),\
+                                           (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epi8_mask(M, X, Y, P)                          \
+  ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X),     \
+                                           (__v32qi)(__m256i)(Y), (int)(P),\
+                                           (__mmask32)(M)))
+
+#define _mm_mask_cmp_epu16_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X),      \
+                                           (__v8hi)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm_mask_cmp_epu8_mask(M, X, Y, P)                             \
+  ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X),    \
+                                           (__v16qi)(__m128i)(Y), (int)(P),\
+                                           (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epu16_mask(M, X, Y, P)                         \
+  ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X),    \
+                                           (__v16hi)(__m256i)(Y), (int)(P),\
+                                           (__mmask16)(M)))
+
+#define _mm256_mask_cmp_epu8_mask(M, X, Y, P)                          \
+  ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X),    \
+                                           (__v32qi)(__m256i)(Y), (int)(P),\
+                                           (__mmask32)(M)))
+#endif
+
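The macro block between #else and #endif mirrors the extern inline definitions above; in upstream GCC it is the branch taken for unoptimized builds (when __OPTIMIZE__ is undefined), keeping the immediate operands (shift counts, shuffle controls, comparison predicates) as integer constant expressions. Either way the public names behave identically, as in this compare-and-select sketch (select_lt16 is a hypothetical helper, build assumptions as before):

/* dst word = (a < b) ? x : y, per signed 16-bit lane */
static inline __m128i
select_lt16 (__m128i a, __m128i b, __m128i x, __m128i y)
{
  __mmask8 k = _mm_cmp_epi16_mask (a, b, _MM_CMPINT_LT);
  return _mm_mask_blend_epi16 (k, y, x);   /* bit set -> x, clear -> y */
}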
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 4,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 1,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 5,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 2,
+                                                 (__mmask32) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 4,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 1,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 5,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 2,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 4,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 1,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 5,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 2,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 4,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 1,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 5,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 2,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 4,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 1,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 5,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi8_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 2,
+                                                 (__mmask16) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 4,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 1,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 5,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi16_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 2,
+                                                (__mmask8) -1);
+}
+
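The cmpneq/cmplt/cmpge/cmple wrappers above simply hard-code the predicate operand of the generic compares: the values 4, 1, 5 and 2 correspond to _MM_CMPINT_NE, _MM_CMPINT_LT, _MM_CMPINT_NLT (i.e. >=) and _MM_CMPINT_LE from avx512fintrin.h. For instance (cmp_equiv is a made-up helper):

static inline void
cmp_equiv (__m128i a, __m128i b)
{
  /* k1 and k2 are always identical.  */
  __mmask16 k1 = _mm_cmpneq_epi8_mask (a, b);
  __mmask16 k2 = _mm_cmp_epi8_mask (a, b, _MM_CMPINT_NE);
  (void) k1; (void) k2;
}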
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X,
+                         __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+                                                   (__v16hi) __Y,
+                                                   (__v16hi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X,
+                                                   (__v16hi) __Y,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+                        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v16hi) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v16hi)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                     __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+                     __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v8hi) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X,
+                      __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+                                                   (__v8hi) __Y,
+                                                   (__v8hi) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X,
+                                                   (__v8hi) __Y,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                     __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
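Among the multiply intrinsics above, the pmulhrsw-based ones implement the usual Q15 fixed-point product: each selected word becomes (a*b + 0x4000) >> 15. A merge-masked sketch (q15_mul is a hypothetical name, same build assumptions):

static inline __m256i
q15_mul (__m256i src, __mmask16 k, __m256i a, __m256i b)
{
  /* Words whose mask bit is clear keep the value from src.  */
  return _mm256_mask_mulhrs_epi16 (src, k, a, b);
}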
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
+                                                   (__v16hi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
+                                                   (__v8hi) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
+                                                   (__v16hi) __W,
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
+                                                   (__v8hi) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
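The widening conversions above read 16 bytes from a 128-bit source and produce 16 words; under zero masking, de-selected destination words are cleared. For example (widen_low8 is illustrative only):

static inline __m256i
widen_low8 (__m128i bytes)
{
  /* Zero-extend the first 8 bytes; the upper 8 result words are forced to 0. */
  return _mm256_maskz_cvtepu8_epi16 ((__mmask16) 0x00FF, bytes);
}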
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__v32qi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__v32qi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+                  __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__v16qi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__v16qi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+                                                (__v16hi) __B,
+                                                (__v16hi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A,
+                                                (__v16hi) __B,
+                                                (__v16hi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
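pavg computes the rounding average (a + b + 1) >> 1 in every lane; the masked variants above only gate which lanes are written. A small zero-masked example (avg_low4 is not a real intrinsic, assumptions as before):

static inline __m128i
avg_low4 (__m128i a, __m128i b)
{
  /* Rounding average of the low four 16-bit lanes; the rest become zero.  */
  return _mm_maskz_avg_epu16 ((__mmask8) 0x0F, a, b);
}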
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__v32qi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__v32qi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+                                                (__v16hi) __B,
+                                                (__v16hi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A,
+                                                (__v16hi) __B,
+                                                (__v16hi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+                                                  (__v32qi) __B,
+                                                  (__v32qi) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A,
+                                                  (__v32qi) __B,
+                                                  (__v32qi)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v16hi) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v16hi)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__v32qi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__v32qi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+                                                (__v16hi) __B,
+                                                (__v16hi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A,
+                                                (__v16hi) __B,
+                                                (__v16hi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+                                                  (__v32qi) __B,
+                                                  (__v32qi) __W,
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A,
+                                                  (__v32qi) __B,
+                                                  (__v32qi)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v16hi) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A,
+                                                  (__v16hi) __B,
+                                                  (__v16hi)
+                                                  _mm256_setzero_si256 (),
+                                                  (__mmask16) __U);
+}
+
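Unlike the plain add/sub forms, the adds/subs intrinsics above saturate instead of wrapping (0..255 for epu8, 0..65535 for epu16, the corresponding signed ranges for epi8/epi16). A sketch of a merge-masked saturating byte add (sat_add_u8 is a hypothetical wrapper):

static inline __m256i
sat_add_u8 (__m256i src, __mmask32 k, __m256i a, __m256i b)
{
  /* Selected bytes become min (a + b, 255); unselected bytes keep src.  */
  return _mm256_mask_adds_epu8 (src, k, a, b);
}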
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                  __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__v16qi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__v16qi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    (__v32qi) __W,
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    (__v32qi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    (__v16qi) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    (__v16qi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    (__v16hi) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+                                                    (__v8hi) __B,
+                                                    (__v8hi) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A,
+                                                    (__v8hi) __B,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                          __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    (__v32qi) __W,
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    (__v32qi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                       __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    (__v16qi) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    (__v16qi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    (__v16hi) __W,
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    (__v16hi)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+                                                    (__v8hi) __B,
+                                                    (__v8hi) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A,
+                                                    (__v8hi) __B,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+                                                   (__v16qi) __B, 0,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+                                                   (__v16qi) __B, 0,
+                                                   __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+                                                   (__v32qi) __B, 0,
+                                                   (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+                                                   (__v32qi) __B, 0,
+                                                   __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B, 0,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B, 0,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B, 0,
+                                                   __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+                                                   (__v16qi) __B, 6,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A,
+                                                   (__v16qi) __B, 6,
+                                                   __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A,
+                                                    (__v16qi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+                                                   (__v32qi) __B, 6,
+                                                   (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A,
+                                                   (__v32qi) __B, 6,
+                                                   __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A,
+                                                    (__v32qi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B, 6,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B, 6,
+                                                   (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B, 6,
+                                                   __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A,
+                                                    (__v16hi) __B,
+                                                    __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi8_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A,
+                                                (__v16qi) __B, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi8_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+                                                (__v32qi) __B,
+                                                (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A,
+                                                (__v32qi) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi16_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+                                               (__v8hi) __B,
+                                               (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A,
+                                               (__v8hi) __B, __U);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi16_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+                                                (__v16hi) __B,
+                                                (__mmask16) -1);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A,
+                                                (__v16hi) __B, __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A,
+                         __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi) __W,
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A,
+                                                 (__v32qi) __B,
+                                                 (__v32qi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                      __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B,
+                                                   (__v32qi)
+                                                   _mm256_setzero_si256 (),
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+                        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B,
+                                                   (__v32qi) __W,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+                     __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B,
+                                                   (__v16qi) __W,
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B,
+                                                   (__v32qi)
+                                                   _mm256_setzero_si256 (),
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A,
+                         __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A,
+                                                   (__v16hi) __B,
+                                                   (__v32qi) __W,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A,
+                      __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A,
+                                                   (__v8hi) __B,
+                                                   (__v16qi) __W,
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+                                                (__v32qi) __W,
+                                                (__mmask32) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A,
+                                                (__v32qi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask32) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+                                                (__v16qi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A,
+                                                (__v16qi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+                                                (__v16hi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A,
+                                                (__v16hi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 4,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 1,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 5,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask32
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu8_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 2,
+                                                  (__mmask32) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 4,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 1,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 5,
+                                                  (__mmask16) -1);
+}
+
+extern __inline __mmask16
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu16_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 2,
+                                                  (__mmask16) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi16 (void *__P, __m256i __A)
+{
+  *(__v16hi_u *) __P = (__v16hi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A)
+{
+  __builtin_ia32_storedquhi256_mask ((short *) __P,
+                                    (__v16hi) __A,
+                                    (__mmask16) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi16 (void *__P, __m128i __A)
+{
+  *(__v8hi_u *) __P = (__v8hi_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedquhi128_mask ((short *) __P,
+                                    (__v8hi) __A,
+                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+                                                  (__v16qi) __B,
+                                                  (__v16qi) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A,
+                                                  (__v16qi) __B,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v8hi) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
+                                                (__v8hi) __B,
+                                                (__v16hi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A,
+                                                (__v8hi) __B,
+                                                (__v16hi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
+                                                (__v8hi) __B,
+                                                (__v16hi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A,
+                                                (__v8hi) __B,
+                                                (__v16hi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A,
+                                                 (__v8hi) __B,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+                                                  (__v16qi) __B,
+                                                  (__v16qi) __W,
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A,
+                                                  (__v16qi) __B,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v8hi) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A,
+                                                  (__v8hi) __B,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                  __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__v16qi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A,
+                                                (__v16qi) __B,
+                                                (__v16qi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A,
+                                                 (__v16qi) __B,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi8 (__m128i __A)
+{
+
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+                                                 (__v16qi)_mm_undefined_si128(),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A)
+{
+  __builtin_ia32_pmovwb128mem_mask ((unsigned long long *) __P , (__v8hi) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srlv_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srlv_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sllv_epi16 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi) __W,
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A,
+                                                 (__v16hi) __B,
+                                                 (__v16hi)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sllv_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A,
+                                                (__v8hi) __B,
+                                                (__v8hi)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
+                                                (__v8hi) __B,
+                                                (__v16hi) __W,
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A,
+                                                (__v8hi) __B,
+                                                (__v16hi)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+                                                   (__v8si) __B,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+                         __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A,
+                                                   (__v8si) __B,
+                                                   (__v16hi) __W,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+                                                   (__v4si) __B,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packus_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+                      __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A,
+                                                   (__v4si) __B,
+                                                   (__v8hi) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+                                                   (__v8si) __B,
+                                                   (__v16hi)
+                                                   _mm256_setzero_si256 (),
+                                                   __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A,
+                        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A,
+                                                   (__v8si) __B,
+                                                   (__v16hi) __W,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+                                                   (__v4si) __B,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_packs_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+                     __m128i __B)
+{
+  return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A,
+                                                   (__v4si) __B,
+                                                   (__v8hi) __W, __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 4,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 1,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 5,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X,
+                                                  (__v16qi) __Y, 2,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 4,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 1,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 5,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X,
+                                                 (__v8hi) __Y, 2,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 4,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 1,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 5,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X,
+                                                 (__v16qi) __Y, 2,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 4,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 1,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 5,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X,
+                                                (__v8hi) __Y, 2,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 4,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 1,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 5,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X,
+                                                  (__v32qi) __Y, 2,
+                                                  (__mmask32) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 4,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 1,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 5,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X,
+                                                  (__v16hi) __Y, 2,
+                                                  (__mmask16) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 4,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 1,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 5,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X,
+                                                 (__v32qi) __Y, 2,
+                                                 (__mmask32) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 4,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 1,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 5,
+                                                 (__mmask16) __M);
+}
+
+extern __inline __mmask16
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X,
+                                                 (__v16hi) __Y, 2,
+                                                 (__mmask16) __M);
+}
+
+#ifdef __DISABLE_AVX512VLBW__
+#undef __DISABLE_AVX512VLBW__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VLBW__ */
+
+#endif /* _AVX512VLBWINTRIN_H_INCLUDED */
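Usage note: the masked AVX-512VL/BW intrinsics added above are reached only through <immintrin.h> (the header rejects direct inclusion) and require a compiler invocation with -mavx512vl -mavx512bw. Below is a minimal consumer-side sketch, illustrative only and not part of the committed header; the function name shift_even_lanes and the mask constant are assumptions made for the example:

    #include <immintrin.h>

    /* Shift the even 16-bit lanes of v left by one; the maskz variant
       zeroes every lane whose mask bit is clear (here, the odd lanes). */
    static __m256i shift_even_lanes (__m256i v)
    {
      return _mm256_maskz_sllv_epi16 ((__mmask16) 0x5555, v,
                                      _mm256_set1_epi16 (1));
    }

Compiled with, e.g., `gcc -O2 -mavx512vl -mavx512bw`, this exercises the _mm256_maskz_sllv_epi16 wrapper defined in the hunk above.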
diff --git a/include-gcc/avx512vldqintrin.h b/include-gcc/avx512vldqintrin.h
new file mode 100644 (file)
index 0000000..be4d59c
--- /dev/null
@@ -0,0 +1,2016 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLDQINTRIN_H_INCLUDED
+#define _AVX512VLDQINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512DQ__)
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512dq")
+#define __DISABLE_AVX512VLDQ__
+#endif /* __AVX512VLDQ__ */
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epi64 (__m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi64 (__m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                                                    (__v2di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epu64 (__m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                                                     (__v4di)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                                                     (__v4di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A,
+                                                     (__v4di)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epu64 (__m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                                                     (__v2di)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                                                     (__v2di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A,
+                                                     (__v2di)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epi64 (__m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epi64 (__m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epu64 (__m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A)
+{
+  return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epu64 (__m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                                                    (__v2di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epi64 (__m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi64 (__m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                                                    (__v2di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epu64 (__m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                                                     (__v4di)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                                                     (__v4di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A,
+                                                     (__v4di)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epu64 (__m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                                                     (__v2di)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                                                     (__v2di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A,
+                                                     (__v2di)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_f64x2 (__m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
+                                                          __A,
+                                                          (__v4df)_mm256_undefined_pd(),
+                                                          (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
+                                                          __A,
+                                                          (__v4df)
+                                                          __O, __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df)
+                                                          __A,
+                                                          (__v4df)
+                                                          _mm256_setzero_ps (),
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_i64x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
+                                                          __A,
+                                                          (__v4di)_mm256_undefined_si256(),
+                                                          (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
+                                                          __A,
+                                                          (__v4di)
+                                                          __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di)
+                                                          __A,
+                                                          (__v4di)
+                                                          _mm256_setzero_si256 (),
+                                                          __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_f32x2 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                                                         (__v8sf)_mm256_undefined_ps(),
+                                                         (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                                                         (__v8sf) __O,
+                                                         __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A,
+                                                         (__v8sf)
+                                                         _mm256_setzero_ps (),
+                                                         __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_i32x2 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
+                                                          __A,
+                                                         (__v8si)_mm256_undefined_si256(),
+                                                          (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
+                                                          __A,
+                                                          (__v8si)
+                                                          __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si)
+                                                          __A,
+                                                          (__v8si)
+                                                          _mm256_setzero_si256 (),
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcast_i32x2 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
+                                                          __A,
+                                                         (__v4si)_mm_undefined_si128(),
+                                                          (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
+                                                          __A,
+                                                          (__v4si)
+                                                          __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si)
+                                                          __A,
+                                                          (__v4si)
+                                                          _mm_setzero_si128 (),
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mullo_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du) __A * (__v4du) __B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du) __A * (__v2du) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                     __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                      __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+                                                 (__v4df) __B,
+                                                 (__v4df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A,
+                                                 (__v4df) __B,
+                                                 (__v4df)
+                                                 _mm256_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                   __m128d __B)
+{
+  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+                                                 (__v2df) __B,
+                                                 (__v2df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A,
+                                                 (__v2df) __B,
+                                                 (__v2df)
+                                                 _mm_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                      __m256 __B)
+{
+  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A,
+                                                (__v8sf) __B,
+                                                (__v8sf)
+                                                _mm256_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A,
+                                                (__v4sf) __B,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epi64 (__m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi64 (__m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epu64 (__m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epu64 (__m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                                                    (__v2di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_ps (__m256i __A)
+{
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A)
+{
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A)
+{
+  return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_ps (__m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu64_ps (__m256i __A)
+{
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A)
+{
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A)
+{
+  return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu64_ps (__m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_pd (__m256i __A)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_pd (__m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu64_pd (__m256i __A)
+{
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A)
+{
+  return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu64_pd (__m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+                                               (__v4df) __B,
+                                               (__v4df) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A,
+                                               (__v4df) __B,
+                                               (__v4df)
+                                               _mm256_setzero_pd (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A,
+                                               (__v2df) __B,
+                                               (__v2df)
+                                               _mm_setzero_pd (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+                                              (__v8sf) __B,
+                                              (__v8sf) __W,
+                                              (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A,
+                                              (__v8sf) __B,
+                                              (__v8sf)
+                                              _mm256_setzero_ps (),
+                                              (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                                              (__v4sf) __B,
+                                              (__v4sf) __W,
+                                              (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A,
+                                              (__v4sf) __B,
+                                              (__v4sf)
+                                              _mm_setzero_ps (),
+                                              (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi32 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2d128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi32 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2d256 (__A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movm_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_cvtmask2q128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movm_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_cvtmask2q256 (__A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi32_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi32_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi64_mask (__m128i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movepi64_mask (__m256i __A)
+{
+  return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf64x2_pd (__m256d __A, const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+                                                        __imm,
+                                                        (__v2df)
+                                                        _mm_setzero_pd (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m256d __A,
+                            const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+                                                        __imm,
+                                                        (__v2df) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extractf64x2_pd (__mmask8 __U, __m256d __A,
+                             const int __imm)
+{
+  return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A,
+                                                        __imm,
+                                                        (__v2df)
+                                                        _mm_setzero_pd (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti64x2_epi64 (__m256i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
+                                                        __imm,
+                                                        (__v2di)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m256i __A,
+                               const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
+                                                        __imm,
+                                                        (__v2di) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extracti64x2_epi64 (__mmask8 __U, __m256i __A,
+                                const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A,
+                                                        __imm,
+                                                        (__v2di)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_pd (__m256d __A, int __B)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_pd (__m256d __W, __mmask8 __U, __m256d __A, int __B)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_pd (__mmask8 __U, __m256d __A, int __B)
+{
+  return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_pd (__m128d __A, int __B)
+{
+  return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_pd (__m128d __W, __mmask8 __U, __m128d __A, int __B)
+{
+  return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_pd (__mmask8 __U, __m128d __A, int __B)
+{
+  return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_reduce_ps (__m256 __A, int __B)
+{
+  return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_reduce_ps (__m256 __W, __mmask8 __U, __m256 __A, int __B)
+{
+  return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_reduce_ps (__mmask8 __U, __m256 __A, int __B)
+{
+  return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_reduce_ps (__m128 __A, int __B)
+{
+  return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_reduce_ps (__m128 __W, __mmask8 __U, __m128 __A, int __B)
+{
+  return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_reduce_ps (__mmask8 __U, __m128 __A, int __B)
+{
+  return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_range_pd (__m256d __A, __m256d __B, int __C)
+{
+  return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
+                                                  (__v4df) __B, __C,
+                                                  (__v4df)
+                                                  _mm256_setzero_pd (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_range_pd (__m256d __W, __mmask8 __U,
+                     __m256d __A, __m256d __B, int __C)
+{
+  return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
+                                                  (__v4df) __B, __C,
+                                                  (__v4df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_range_pd (__mmask8 __U, __m256d __A, __m256d __B, int __C)
+{
+  return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A,
+                                                  (__v4df) __B, __C,
+                                                  (__v4df)
+                                                  _mm256_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_pd (__m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_pd (__m128d __W, __mmask8 __U,
+                  __m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_pd (__mmask8 __U, __m128d __A, __m128d __B, int __C)
+{
+  return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A,
+                                                  (__v2df) __B, __C,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_range_ps (__m256 __A, __m256 __B, int __C)
+{
+  return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
+                                                 (__v8sf) __B, __C,
+                                                 (__v8sf)
+                                                 _mm256_setzero_ps (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_range_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B,
+                     int __C)
+{
+  return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
+                                                 (__v8sf) __B, __C,
+                                                 (__v8sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_range_ps (__mmask8 __U, __m256 __A, __m256 __B, int __C)
+{
+  return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A,
+                                                 (__v8sf) __B, __C,
+                                                 (__v8sf)
+                                                 _mm256_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_range_ps (__m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_range_ps (__m128 __W, __mmask8 __U,
+                  __m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_range_ps (__mmask8 __U, __m128 __A, __m128 __B, int __C)
+{
+  return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A,
+                                                 (__v4sf) __B, __C,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_pd_mask (__mmask8 __U, __m256d __A,
+                            const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A,
+                                                     __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_pd_mask (__m256d __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fpclass_ps_mask (__mmask8 __U, __m256 __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A,
+                                                     __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fpclass_ps_mask (__m256 __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_pd_mask (__mmask8 __U, __m128d __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A,
+                                                     __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_pd_mask (__m128d __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_ps_mask (__mmask8 __U, __m128 __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A,
+                                                     __imm, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_ps_mask (__m128 __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
+                                                       (__v2di) __B,
+                                                       __imm,
+                                                       (__v4di)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_inserti64x2 (__m256i __W, __mmask8 __U, __m256i __A,
+                        __m128i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
+                                                       (__v2di) __B,
+                                                       __imm,
+                                                       (__v4di) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_inserti64x2 (__mmask8 __U, __m256i __A, __m128i __B,
+                         const int __imm)
+{
+  return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A,
+                                                       (__v2di) __B,
+                                                       __imm,
+                                                       (__v4di)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm)
+{
+  return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
+                                                       (__v2df) __B,
+                                                       __imm,
+                                                       (__v4df)
+                                                       _mm256_setzero_pd (),
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_insertf64x2 (__m256d __W, __mmask8 __U, __m256d __A,
+                        __m128d __B, const int __imm)
+{
+  return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
+                                                       (__v2df) __B,
+                                                       __imm,
+                                                       (__v4df) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_insertf64x2 (__mmask8 __U, __m256d __A, __m128d __B,
+                         const int __imm)
+{
+  return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A,
+                                                       (__v2df) __B,
+                                                       __imm,
+                                                       (__v4df)
+                                                       _mm256_setzero_pd (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+#else
+#define _mm256_insertf64x2(X, Y, C)                                     \
+  ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
+    (__v2df)(__m128d) (Y), (int) (C),                                  \
+    (__v4df)(__m256d)_mm256_setzero_pd(),                              \
+    (__mmask8)-1))
+
+#define _mm256_mask_insertf64x2(W, U, X, Y, C)                          \
+  ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
+    (__v2df)(__m128d) (Y), (int) (C),                                  \
+    (__v4df)(__m256d)(W),                                              \
+    (__mmask8)(U)))
+
+#define _mm256_maskz_insertf64x2(U, X, Y, C)                           \
+  ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\
+    (__v2df)(__m128d) (Y), (int) (C),                                  \
+    (__v4df)(__m256d)_mm256_setzero_pd(),                              \
+    (__mmask8)(U)))
+
+#define _mm256_inserti64x2(X, Y, C)                                     \
+  ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
+    (__v2di)(__m128i) (Y), (int) (C),                                  \
+    (__v4di)(__m256i)_mm256_setzero_si256 (),                          \
+    (__mmask8)-1))
+
+#define _mm256_mask_inserti64x2(W, U, X, Y, C)                          \
+  ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
+    (__v2di)(__m128i) (Y), (int) (C),                                  \
+    (__v4di)(__m256i)(W),                                              \
+    (__mmask8)(U)))
+
+#define _mm256_maskz_inserti64x2(U, X, Y, C)                            \
+  ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\
+    (__v2di)(__m128i) (Y), (int) (C),                                  \
+    (__v4di)(__m256i)_mm256_setzero_si256 (),                          \
+    (__mmask8)(U)))
+
+#define _mm256_extractf64x2_pd(X, C)                                    \
+  ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
+    (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_mask_extractf64x2_pd(W, U, X, C)                         \
+  ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
+    (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U)))
+
+#define _mm256_maskz_extractf64x2_pd(U, X, C)                           \
+  ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\
+    (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8) (U)))
+
+#define _mm256_extracti64x2_epi64(X, C)                                 \
+  ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
+    (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm256_mask_extracti64x2_epi64(W, U, X, C)                     \
+  ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
+    (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm256_maskz_extracti64x2_epi64(U, X, C)                        \
+  ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\
+    (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
+#define _mm256_reduce_pd(A, B)                                         \
+  ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A),    \
+    (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_mask_reduce_pd(W, U, A, B)                              \
+  ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A),    \
+    (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_reduce_pd(U, A, B)                                        \
+  ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A),    \
+    (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
+
+#define _mm_reduce_pd(A, B)                                            \
+  ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A),    \
+    (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1))
+
+#define _mm_mask_reduce_pd(W, U, A, B)                                 \
+  ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A),    \
+    (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_pd(U, A, B)                                   \
+  ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A),    \
+    (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
+
+#define _mm256_reduce_ps(A, B)                                         \
+  ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A),      \
+    (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))
+
+#define _mm256_mask_reduce_ps(W, U, A, B)                              \
+  ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A),      \
+    (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_reduce_ps(U, A, B)                                        \
+  ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A),      \
+    (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))
+
+#define _mm_reduce_ps(A, B)                                            \
+  ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A),      \
+    (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
+
+#define _mm_mask_reduce_ps(W, U, A, B)                                 \
+  ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A),      \
+    (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_reduce_ps(U, A, B)                                   \
+  ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A),      \
+    (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
+
+#define _mm256_range_pd(A, B, C)                                       \
+  ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A),     \
+    (__v4df)(__m256d)(B), (int)(C),                                    \
+    (__v4df)_mm256_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_maskz_range_pd(U, A, B, C)                              \
+  ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A),     \
+    (__v4df)(__m256d)(B), (int)(C),                                    \
+    (__v4df)_mm256_setzero_pd(), (__mmask8)(U)))
+
+#define _mm_range_pd(A, B, C)                                          \
+  ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A),     \
+    (__v2df)(__m128d)(B), (int)(C),                                    \
+    (__v2df)_mm_setzero_pd(), (__mmask8)-1))
+
+#define _mm256_range_ps(A, B, C)                                       \
+  ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A),       \
+    (__v8sf)(__m256)(B), (int)(C),                                     \
+    (__v8sf)_mm256_setzero_ps(), (__mmask8)-1))
+
+#define _mm256_mask_range_ps(W, U, A, B, C)                            \
+  ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A),       \
+    (__v8sf)(__m256)(B), (int)(C),                                     \
+    (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_range_ps(U, A, B, C)                              \
+  ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A),       \
+    (__v8sf)(__m256)(B), (int)(C),                                     \
+    (__v8sf)_mm256_setzero_ps(), (__mmask8)(U)))
+
+#define _mm_range_ps(A, B, C)                                          \
+  ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A),       \
+    (__v4sf)(__m128)(B), (int)(C),                                     \
+    (__v4sf)_mm_setzero_ps(), (__mmask8)-1))
+
+#define _mm_mask_range_ps(W, U, A, B, C)                               \
+  ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A),       \
+    (__v4sf)(__m128)(B), (int)(C),                                     \
+    (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_range_ps(U, A, B, C)                                 \
+  ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A),       \
+    (__v4sf)(__m128)(B), (int)(C),                                     \
+    (__v4sf)_mm_setzero_ps(), (__mmask8)(U)))
+
+#define _mm256_mask_range_pd(W, U, A, B, C)                            \
+  ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A),     \
+    (__v4df)(__m256d)(B), (int)(C),                                    \
+    (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm_mask_range_pd(W, U, A, B, C)                               \
+  ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A),     \
+    (__v2df)(__m128d)(B), (int)(C),                                    \
+    (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_range_pd(U, A, B, C)                                 \
+  ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A),     \
+    (__v2df)(__m128d)(B), (int)(C),                                    \
+    (__v2df)_mm_setzero_pd(), (__mmask8)(U)))
+
+#define _mm256_mask_fpclass_pd_mask(u, X, C)                            \
+  ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \
+                                               (int) (C),(__mmask8)(u)))
+
+#define _mm256_mask_fpclass_ps_mask(u, X, C)                           \
+  ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X),  \
+                                               (int) (C),(__mmask8)(u)))
+
+#define _mm_mask_fpclass_pd_mask(u, X, C)                               \
+  ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \
+                                               (int) (C),(__mmask8)(u)))
+
+#define _mm_mask_fpclass_ps_mask(u, X, C)                               \
+  ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X),  \
+                                               (int) (C),(__mmask8)(u)))
+
+#define _mm256_fpclass_pd_mask(X, C)                                    \
+  ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \
+                                               (int) (C),(__mmask8)-1))
+
+#define _mm256_fpclass_ps_mask(X, C)                                    \
+  ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X),  \
+                                               (int) (C),(__mmask8)-1))
+
+#define _mm_fpclass_pd_mask(X, C)                                       \
+  ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \
+                                               (int) (C),(__mmask8)-1))
+
+#define _mm_fpclass_ps_mask(X, C)                                       \
+  ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X),  \
+                                               (int) (C),(__mmask8)-1))
+
+#endif
+
+#ifdef __DISABLE_AVX512VLDQ__
+#undef __DISABLE_AVX512VLDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VLDQ__ */
+
+#endif /* _AVX512VLDQINTRIN_H_INCLUDED */
diff --git a/include-gcc/avx512vlintrin.h b/include-gcc/avx512vlintrin.h
new file mode 100644 (file)
index 0000000..758b71a
--- /dev/null
@@ -0,0 +1,13896 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VLINTRIN_H_INCLUDED
+#define _AVX512VLINTRIN_H_INCLUDED
+
+#ifndef __AVX512VL__
+#pragma GCC push_options
+#pragma GCC target("avx512vl")
+#define __DISABLE_AVX512VL__
+#endif /* __AVX512VL__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef unsigned int __mmask32;
+typedef int __v4si_u __attribute__ ((__vector_size__ (16),     \
+                                    __may_alias__, __aligned__ (1)));
+typedef int __v8si_u __attribute__ ((__vector_size__ (32),     \
+                                    __may_alias__, __aligned__ (1)));
+typedef long long __v2di_u __attribute__ ((__vector_size__ (16),       \
+                                          __may_alias__, __aligned__ (1)));
+typedef long long __v4di_u __attribute__ ((__vector_size__ (32),       \
+                                          __may_alias__, __aligned__ (1)));
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+                                                 (__v4df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A,
+                                                 (__v4df)
+                                                 _mm256_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A,
+                                                 (__v2df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A,
+                                                 (__v2df)
+                                                 _mm_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+                                                  (__v4df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P,
+                                                  (__v4df)
+                                                  _mm256_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+                                                  (__v2df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_pd (__mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeapd256_mask ((__v4df *) __P,
+                                  (__v4df) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeapd128_mask ((__v2df *) __P,
+                                  (__v2df) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A,
+                                                (__v8sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A,
+                                                (__v8sf)
+                                                _mm256_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+                                                 (__v8sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P,
+                                                 (__v8sf)
+                                                 _mm256_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+                                                 (__v4sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_ps (__mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeaps256_mask ((__v8sf *) __P,
+                                  (__v8sf) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeaps128_mask ((__v4sf *) __P,
+                                  (__v4sf) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A,
+                                                    (__v2di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_epi64 (void const *__P)
+{
+  return (__m256i) (*(__v4di *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+                                                       (__v4di) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P,
+                                                       (__v4di)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_epi64 (void const *__P)
+{
+  return (__m128i) (*(__v2di *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+                                                       (__v2di) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P,
+                                                       (__v2di)
+                                                       _mm_setzero_si128 (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_movdqa64store256_mask ((__v4di *) __P,
+                                       (__v4di) __A,
+                                       (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_movdqa64store128_mask ((__v2di *) __P,
+                                       (__v2di) __A,
+                                       (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_epi32 (void const *__P)
+{
+  return (__m256i) (*(__v8si *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+                                                       (__v8si) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P,
+                                                       (__v8si)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_epi32 (void const *__P)
+{
+  return (__m128i) (*(__v4si *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+                                                       (__v4si) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_load_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P,
+                                                       (__v4si)
+                                                       _mm_setzero_si128 (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_epi32 (void *__P, __m256i __A)
+{
+  *(__v8si *) __P = (__v8si) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_movdqa32store256_mask ((__v8si *) __P,
+                                       (__v8si) __A,
+                                       (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_epi32 (void *__P, __m128i __A)
+{
+  *(__v4si *) __P = (__v4si) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_movdqa32store128_mask ((__v4si *) __P,
+                                       (__v4si) __A,
+                                       (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A,
+                                                (__v2df) __B,
+                                                (__v2df)
+                                                _mm_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A,
+                                               (__v4sf) __B,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_epi64 (void *__P, __m256i __A)
+{
+  *(__m256i *) __P = __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_epi64 (void *__P, __m128i __A)
+{
+  *(__m128i *) __P = __A;
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P,
+                                                  (__v4df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P,
+                                                  (__v4df)
+                                                  _mm256_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P,
+                                                  (__v2df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_storeupd256_mask ((double *) __P,
+                                  (__v4df) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_storeupd128_mask ((double *) __P,
+                                  (__v2df) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P,
+                                                 (__v8sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P,
+                                                 (__v8sf)
+                                                 _mm256_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P,
+                                                 (__v4sf) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P,
+                                                 (__v4sf)
+                                                 _mm_setzero_ps (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_storeups256_mask ((float *) __P,
+                                  (__v8sf) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_storeups128_mask ((float *) __P,
+                                  (__v4sf) __A,
+                                  (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi64 (void const *__P)
+{
+  return (__m256i) (*(__v4di_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi64 (void const *__P)
+{
+  return (__m128i) (*(__v2di_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P,
+                                                    (__v2di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi64 (void *__P, __m256i __A)
+{
+  *(__m256i_u *) __P = (__m256i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqudi256_mask ((long long *) __P,
+                                    (__v4di) __A,
+                                    (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi64 (void *__P, __m128i __A)
+{
+  *(__m128i_u *) __P = (__m128i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqudi128_mask ((long long *) __P,
+                                    (__v2di) __A,
+                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_epi32 (void const *__P)
+{
+  return (__m256i) (*(__v8si_u *) __P);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_epi32 (void const *__P)
+{
+  return (__m128i) (*(__v4si_u *) __P);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_epi32 (void *__P, __m256i __A)
+{
+  *(__m256i_u *) __P = (__m256i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_storedqusi256_mask ((int *) __P,
+                                    (__v8si) __A,
+                                    (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_epi32 (void *__P, __m128i __A)
+{
+  *(__m128i_u *) __P = (__m128i_u) __A;
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_storedqusi128_mask ((int *) __P,
+                                    (__v4si) __A,
+                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_abs_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epu32 (__m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epu32 (__m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epu32 (__m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                                                     (__v8si)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                                                     (__v8si) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A,
+                                                     (__v8si)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epu32 (__m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                                                     (__v4si)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                                                     (__v4si) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A,
+                                                     (__v4si)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epu32 (__m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                                                     (__v4si)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                                                     (__v4si) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A,
+                                                     (__v4si)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epu32 (__m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                                                     (__v4si)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                                                     (__v4si) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A,
+                                                     (__v4si)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A)
+{
+  return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_pd (__m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_pd (__m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A)
+{
+  return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepu32_ps (__m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A)
+{
+  return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_ps (__m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A)
+{
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A)
+{
+  return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A)
+{
+  return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+                                                 (__v16qi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+                                                 (__v16qi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+                                                  (__v16qi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+                                                  (__v16qi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi32_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                                                   (__v16qi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi32_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                                                   (__v16qi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+                                                 (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+                                                 (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+                                                  (__v8hi)__O,
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+                                                  (__v8hi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+                                                  (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi32_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                                                   (__v8hi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                                                   (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi32_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                                                   (__v8hi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                                                   (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
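Note (illustrative, not part of the committed header): the 32-bit down-conversion family above comes in three flavors — plain truncation (`cvtepi32_*`), signed saturation (`cvtsepi32_*`), unsigned saturation (`cvtusepi32_*`) — each with register and memory-destination (`storeu`) forms. A minimal sketch of the difference, assuming AVX-512VL support and `-mavx512vl`:

/* Illustrative only.  */
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

int main (void)
{
  __m128i v = _mm_set_epi32 (-1000, 300, -5, 7);  /* lanes 3..0 */

  /* Plain truncation keeps only the low 8 bits of each lane ...  */
  __m128i t = _mm_cvtepi32_epi8 (v);
  /* ... while the "s" variant saturates to the int8_t range.  */
  __m128i s = _mm_cvtsepi32_epi8 (v);

  int8_t bt[16], bs[16];
  _mm_storeu_si128 ((__m128i *) bt, t);
  _mm_storeu_si128 ((__m128i *) bs, s);
  printf ("truncate: %d %d %d %d\n", bt[0], bt[1], bt[2], bt[3]);
  printf ("saturate: %d %d %d %d\n", bs[0], bs[1], bs[2], bs[3]);
  /* expected: truncate: 7 -5 44 24   /   saturate: 7 -5 127 -128 */

  /* The storeu form writes only the bytes whose mask bit is set.  */
  int8_t out[4] = { 99, 99, 99, 99 };
  _mm_mask_cvtsepi32_storeu_epi8 (out, 0x3, v);   /* lanes 0 and 1 only */
  printf ("masked store: %d %d %d %d\n", out[0], out[1], out[2], out[3]);
  /* expected: 7 -5 99 99 */
  return 0;
}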
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+                                                 (__v16qi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+                                                 (__v16qi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+                                                 (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A,
+                                                 (__v16qi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+                                                  (__v16qi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+                                                  (__v16qi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+                                                  (__v16qi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A,
+                                                  (__v16qi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi64_epi8 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                                                   (__v16qi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi64_epi8 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                                                   (__v16qi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                                                   (__v16qi) __O,
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A,
+                                                   (__v16qi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+                                                 (__v8hi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+                                                 (__v8hi)__O,
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+                                                 (__v8hi)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+                                                 (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+                                                  (__v8hi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+                                                  (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+                                                  (__v8hi)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+                                                  (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A,
+                                                  (__v8hi)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi64_epi16 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                                                   (__v8hi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                                                   (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi64_epi16 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                                                   (__v8hi)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                                                   (__v8hi) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A,
+                                                   (__v8hi)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+                                                 (__v4si)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovqd128mem_mask ((unsigned long long *) __P,
+                                   (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+                                                 (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+                                                 (__v4si)
+                                                 _mm_undefined_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+                                                 (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+                                                  (__v4si)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovsqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+                                                  (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A,
+                                                  (__v4si)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+                                                  (__v4si)
+                                                  _mm_undefined_si128 (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+                                                  (__v4si)__O,
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A,
+                                                  (__v4si)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtusepi64_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                                                   (__v4si)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A)
+{
+  __builtin_ia32_pmovusqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                                                   (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtusepi64_epi32 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                                                   (__v4si)
+                                                   _mm_undefined_si128 (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A)
+{
+  __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                                                   (__v4si) __O, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A)
+{
+  return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   __M);
+}
+
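Note (illustrative, not part of the committed header): because the memory-destination `storeu` forms above only touch the elements selected by the mask, they are convenient for writing the narrowed tail of a buffer without storing past its end. A minimal sketch under the same AVX-512VL assumption:

/* Illustrative only.  */
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

int main (void)
{
  int64_t in[3]  = { 10, 20, 30 };             /* only 3 valid elements */
  int32_t out[3] = { 0, 0, 0 };

  __m256i v = _mm256_set_epi64x (0, in[2], in[1], in[0]);
  __mmask8 tail = (__mmask8) ((1u << 3) - 1);  /* low 3 lanes */

  /* Writes exactly 3 * 4 bytes; lane 3 is neither converted nor stored.  */
  _mm256_mask_cvtepi64_storeu_epi32 (out, tail, v);

  printf ("%d %d %d\n", out[0], out[1], out[2]);  /* expected: 10 20 30 */
  return 0;
}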
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A,
+                                                     (__v8sf) __O,
+                                                     __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     __M);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A,
+                                                     (__v4sf) __O,
+                                                     __M);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A)
+{
+  return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A,
+                                                      (__v4df) __O,
+                                                      __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
+{
+  return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A,
+                                                      (__v8si) __O,
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A,
+                                                      (__v8si)
+                                                      _mm256_setzero_si256 (),
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi32 (__m256i __O, __mmask8 __M, int __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, (__v8si) __O,
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi32 (__mmask8 __M, int __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A,
+                                                          (__v8si)
+                                                          _mm256_setzero_si256 (),
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A,
+                                                      (__v4si) __O,
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A,
+                                                      (__v4si)
+                                                      _mm_setzero_si128 (),
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi32 (__m128i __O, __mmask8 __M, int __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastd128_gpr_mask (__A, (__v4si) __O,
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi32 (__mmask8 __M, int __A)
+{
+  return (__m128i)
+        __builtin_ia32_pbroadcastd128_gpr_mask (__A,
+                                                (__v4si) _mm_setzero_si128 (),
+                                                __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A,
+                                                      (__v4di) __O,
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A,
+                                                      (__v4di)
+                                                      _mm256_setzero_si256 (),
+                                                      __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O,
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+  return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A,
+                                                          (__v4di)
+                                                          _mm256_setzero_si256 (),
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A,
+                                                      (__v2di) __O,
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A,
+                                                      (__v2di)
+                                                      _mm_setzero_si128 (),
+                                                      __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A)
+{
+  return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O,
+                                                          __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_set1_epi64 (__mmask8 __M, long long __A)
+{
+  return (__m128i)
+        __builtin_ia32_pbroadcastq128_gpr_mask (__A,
+                                                (__v2di) _mm_setzero_si128 (),
+                                                __M);
+}
+
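Note (illustrative, not part of the committed header): the masked `broadcast*` and `set1` intrinsics above splat one element (from a vector or from a general-purpose register) into only the lanes selected by the mask. A minimal sketch, again assuming AVX-512VL and `-mavx512vl`:

/* Illustrative only.  */
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>

int main (void)
{
  __m256i old = _mm256_set1_epi32 (-7);
  __mmask8 k  = 0xF0;                    /* upper four lanes only */

  /* Broadcast the scalar 123 into the selected lanes; the rest either
     keep 'old' (mask form) or become zero (maskz form).  */
  __m256i a = _mm256_mask_set1_epi32 (old, k, 123);
  __m256i b = _mm256_maskz_set1_epi32 (k, 123);

  int32_t da[8], db[8];
  _mm256_storeu_si256 ((__m256i *) da, a);
  _mm256_storeu_si256 ((__m256i *) db, b);
  for (int i = 0; i < 8; i++)
    printf ("%d/%d ", da[i], db[i]);
  printf ("\n");
  /* expected: -7/0 -7/0 -7/0 -7/0 123/123 123/123 123/123 123/123 */
  return 0;
}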
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_f32x4 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                                                         (__v8sf)_mm256_undefined_pd (),
+                                                         (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                                                         (__v8sf) __O,
+                                                         __M);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A)
+{
+  return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A,
+                                                         (__v8sf)
+                                                         _mm256_setzero_ps (),
+                                                         __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_i32x4 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+                                                          __A,
+                                                          (__v8si)_mm256_undefined_si256 (),
+                                                          (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+                                                          __A,
+                                                          (__v8si)
+                                                          __O, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si)
+                                                          __A,
+                                                          (__v8si)
+                                                          _mm256_setzero_si256 (),
+                                                          __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X,
+                                                   (__v4di)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X,
+                                                   (__v2di)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
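All of the widening conversions above follow the same AVX-512 masking convention: the _mask_ forms keep the lanes of __W wherever the corresponding mask bit is clear, while the _maskz_ forms zero those lanes. A minimal usage sketch (an editorial illustration, not part of the header; assumes AVX-512VL hardware and a compiler flag such as -mavx512vl):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    /* low four bytes of src are sign-extended to four 64-bit lanes */
    __m128i src = _mm_setr_epi8(-1, 2, -3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m256i old = _mm256_set1_epi64x(99);
    __mmask8 k  = 0x5;                                          /* select lanes 0 and 2 */

    __m256i merged = _mm256_mask_cvtepi8_epi64(old, k, src);   /* {-1, 99, -3, 99} */
    __m256i zeroed = _mm256_maskz_cvtepi8_epi64(k, src);       /* {-1,  0, -3,  0} */

    long long m[4], z[4];
    _mm256_storeu_si256((__m256i *) m, merged);
    _mm256_storeu_si256((__m256i *) z, zeroed);
    printf("merged: %lld %lld %lld %lld\n", m[0], m[1], m[2], m[3]);
    printf("zeroed: %lld %lld %lld %lld\n", z[0], z[1], z[2], z[3]);
    return 0;
}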
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                                             (__v4df)
+                                             _mm256_setzero_pd (),
+                                             (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                                             (__v4df) __W,
+                                             (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A,
+                                             (__v4df)
+                                             _mm256_setzero_pd (),
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                                             (__v2df)
+                                             _mm_setzero_pd (),
+                                             (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                                             (__v2df) __W,
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A,
+                                             (__v2df)
+                                             _mm_setzero_pd (),
+                                             (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+                                            (__v8sf)
+                                            _mm256_setzero_ps (),
+                                            (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+                                            (__v8sf) __W,
+                                            (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A,
+                                            (__v8sf)
+                                            _mm256_setzero_ps (),
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+                                            (__v4sf)
+                                            _mm_setzero_ps (),
+                                            (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+                                            (__v4sf) __W,
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A,
+                                            (__v4sf)
+                                            _mm_setzero_ps (),
+                                            (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt14_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt14_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt14_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
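The rcp14/rsqrt14 intrinsics above return per-lane approximations of 1/x and 1/sqrt(x) whose relative error is documented as at most roughly 2^-14. A common pattern is to refine the estimate with one Newton-Raphson step; a small editorial sketch, assuming -mavx512vl:

#include <immintrin.h>

/* One Newton-Raphson refinement of the ~14-bit reciprocal estimate:
   x1 = x0 * (2 - a * x0). */
static inline __m256d rcp_refined_pd(__m256d a)
{
    __m256d x0  = _mm256_rcp14_pd(a);
    __m256d two = _mm256_set1_pd(2.0);
    return _mm256_mul_pd(x0, _mm256_sub_pd(two, _mm256_mul_pd(a, x0)));
}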
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+                                                 (__v4df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A,
+                                                 (__v4df)
+                                                 _mm256_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+                                                 (__v2df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A,
+                                                 (__v2df)
+                                                 _mm_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+                                                (__v8sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A,
+                                                (__v8sf)
+                                                _mm256_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
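The masked add/sub intrinsics above make per-lane conditional arithmetic a single operation: selected lanes get the sum or difference, unselected lanes keep __W (mask form) or become zero (maskz form). A short editorial sketch of conditional accumulation, assuming -mavx512vl:

#include <immintrin.h>

/* acc[i] += v[i] only in the lanes selected by k; other lanes keep acc[i]. */
static inline __m256i add_where(__m256i acc, __mmask8 k, __m256i v)
{
    return _mm256_mask_add_epi32(acc, k, acc, v);
}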
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getexp_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getexp_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
+                                                (__v4si) __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A,
+                                                (__v4si) __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
+                                                (__v2di) __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A,
+                                                (__v2di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
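The srl forms above shift every selected lane right (logically) by the count held in the low 64 bits of the __m128i operand; counts of the element width or more produce zero. A small editorial sketch, assuming -mavx512vl:

#include <immintrin.h>

/* Logical right shift of the lanes selected by k; unselected lanes are zeroed. */
static inline __m256i shr_selected(__m256i v, __mmask8 k, unsigned count)
{
    __m128i c = _mm_cvtsi32_si128((int) count);   /* count in the low bits */
    return _mm256_maskz_srl_epi32(k, v, c);
}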
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                      __m256d __B)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_scalef_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                      __m256 __B)
+{
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                   __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_scalef_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
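getexp (above) returns each element's exponent as a floating-point value, essentially floor(log2|x|), while scalef multiplies each element by two raised to (the floor of) the matching element of its second operand. Together they give a frexp-style decomposition; an editorial sketch for normal, finite inputs, assuming -mavx512vl:

#include <immintrin.h>

/* x == m * 2^e per lane, with |m| in [1, 2); special values (0, Inf, NaN)
   follow the intrinsics' own conventions and are not handled here. */
static inline void vec_frexp_pd(__m256d x, __m256d *m, __m256d *e)
{
    __m256d ex = _mm256_getexp_pd(x);
    *e = ex;
    *m = _mm256_scalef_pd(x, _mm256_sub_pd(_mm256_setzero_pd(), ex));
}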
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                     __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_pd (__m256d __A, __m256d __B, __m256d __C,
+                      __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                      __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_pd (__m128d __A, __m128d __B, __m128d __C,
+                   __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                   __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf) __C,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmadd_ps (__m256 __A, __m256 __B, __m256 __C,
+                      __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmadd_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                      __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf) __C,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
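The three masking flavours of the FMA intrinsics above compute a*b + c in the selected lanes but differ in what the unselected lanes carry: the first operand (mask), the third operand (mask3), or zero (maskz). A side-by-side editorial sketch, assuming -mavx512vl:

#include <immintrin.h>

static inline void fmadd_flavours(__m256d a, __m256d b, __m256d c, __mmask8 k,
                                  __m256d *r_mask, __m256d *r_mask3,
                                  __m256d *r_maskz)
{
    *r_mask  = _mm256_mask_fmadd_pd(a, k, b, c);    /* unselected lanes = a */
    *r_mask3 = _mm256_mask3_fmadd_pd(a, b, c, k);   /* unselected lanes = c */
    *r_maskz = _mm256_maskz_fmadd_pd(k, a, b, c);   /* unselected lanes = 0 */
}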
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                     __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_pd (__m256d __A, __m256d __B, __m256d __C,
+                      __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                      __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmsubpd256_maskz ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_pd (__m128d __A, __m128d __B, __m128d __C,
+                   __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                   __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmsubpd128_maskz ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf) __C,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsub_ps (__m256 __A, __m256 __B, __m256 __C,
+                      __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsub_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                      __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmsubps256_maskz ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf) __C,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmsubps128_maskz ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                        __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4df) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C,
+                         __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmaddsub_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                         __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmaddsub_pd (__m128d __A, __mmask8 __U, __m128d __B,
+                     __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2df) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C,
+                      __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmaddsub_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                      __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmaddsub_ps (__m256 __A, __mmask8 __U, __m256 __B,
+                        __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8sf) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C,
+                         __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmaddsub_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                         __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmaddsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4sf) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C,
+                      __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmaddsub_ps (__mmask8 __U, __m128 __A, __m128 __B,
+                      __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsubadd_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                        __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      -(__v4df) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C,
+                         __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__v4df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsubadd_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                         __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A,
+                                                       (__v4df) __B,
+                                                       -(__v4df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsubadd_pd (__m128d __A, __mmask8 __U, __m128d __B,
+                     __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      -(__v2df) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C,
+                      __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__v2df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsubadd_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                      __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A,
+                                                       (__v2df) __B,
+                                                       -(__v2df) __C,
+                                                        (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fmsubadd_ps (__m256 __A, __mmask8 __U, __m256 __B,
+                        __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     -(__v8sf) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C,
+                         __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fmsubadd_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                         __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      -(__v8sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fmsubadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     -(__v4sf) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C,
+                      __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fmsubadd_ps (__mmask8 __U, __m128 __A, __m128 __B,
+                      __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      -(__v4sf) __C,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                      __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C,
+                       __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmadd_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                       __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmaddpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_pd (__m128d __A, __mmask8 __U, __m128d __B,
+                   __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C,
+                    __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                    __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmaddpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmadd_ps (__m256 __A, __mmask8 __U, __m256 __B,
+                      __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C,
+                       __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmadd_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                       __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmaddps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmaddps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                      __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A,
+                                                    (__v4df) __B,
+                                                    (__v4df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C,
+                       __mmask8 __U)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                       __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfnmsubpd256_maskz ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_pd (__m128d __A, __mmask8 __U, __m128d __B,
+                   __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A,
+                                                    (__v2df) __B,
+                                                    (__v2df) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C,
+                    __mmask8 __U)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                    __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfnmsubpd128_maskz ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2df) __C,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fnmsub_ps (__m256 __A, __mmask8 __U, __m256 __B,
+                      __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A,
+                                                   (__v8sf) __B,
+                                                   (__v8sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask3_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C,
+                       __mmask8 __U)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fnmsub_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                       __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfnmsubps256_maskz ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8sf) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fnmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A,
+                                                   (__v4sf) __B,
+                                                   (__v4sf) __C,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask3_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fnmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfnmsubps128_maskz ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4sf) __C,
+                                                    (__mmask8) __U);
+}
+
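(Editor's illustration, not part of the imported header.) A minimal sketch of calling the masked fused-multiply variants above; `fmsub_selected` is a hypothetical helper name, and a GCC build with -mavx512f -mavx512vl is assumed:

    #include <immintrin.h>

    /* a*b - c in the lanes selected by `lanes`; all other lanes are
       zeroed, matching the maskz variant defined above.             */
    static inline __m256
    fmsub_selected (__m256 a, __m256 b, __m256 c, __mmask8 lanes)
    {
      return _mm256_maskz_fmsub_ps (lanes, a, b, c);
    }
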
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                         __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                      __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+                                               (__v8si) __B,
+                                               (__v8si) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A,
+                                               (__v8si) __B,
+                                               (__v8si)
+                                               _mm256_setzero_si256 (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8su)__A | (__v8su)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+                                               (__v4si) __B,
+                                               (__v4si) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A,
+                                               (__v4si) __B,
+                                               (__v4si)
+                                               _mm_setzero_si128 (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4su)__A | (__v4su)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v8su)__A ^ (__v8su)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4su)__A ^ (__v4su)__B);
+}
+
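(Editor's illustration, not part of the imported header.) A hedged sketch of merge masking with the bitwise intrinsics above; `toggle_selected` is a hypothetical name, AVX-512VL assumed:

    #include <immintrin.h>

    /* Flip the bits given in `bits`, but only in the 32-bit elements whose
       mask bit is set; unselected elements pass through from `a` (the __W
       argument of _mm_mask_xor_epi32 above).                               */
    static inline __m128i
    toggle_selected (__m128i a, __m128i bits, __mmask8 lanes)
    {
      return _mm_mask_xor_epi32 (a, lanes, a, bits);
    }
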
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+                                               (__v4sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A)
+{
+  return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A,
+                                               (__v4sf)
+                                               _mm_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A)
+{
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A)
+{
+  return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A,
+                                                   (__v8si)
+                                                   _mm256_setzero_si256 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A,
+                                                   (__v4si)
+                                                   _mm_setzero_si128 (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epu32 (__m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A)
+{
+  return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epu32 (__m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A)
+{
+  return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
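(Editor's illustration, not part of the imported header.) The zero-masking conversions above can pick out individual elements; `cvt_even_lanes` is a hypothetical helper:

    #include <immintrin.h>

    /* Convert elements 0 and 2 of `x` to unsigned 32-bit integers,
       zeroing elements 1 and 3 (mask 0b0101).                       */
    static inline __m128i
    cvt_even_lanes (__m128 x)
    {
      return _mm_maskz_cvtps_epu32 ((__mmask8) 0x5, x);
    }
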
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A,
+                                                  (__v4df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A,
+                                                  (__v4df)
+                                                  _mm256_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A,
+                                                  (__v2df) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A,
+                                                  (__v2df)
+                                                  _mm_setzero_pd (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
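(Editor's illustration, not part of the imported header.) A small sketch of merge-masked duplication using _mm256_mask_movedup_pd above; the helper name is hypothetical:

    #include <immintrin.h>

    /* Duplicate the even-indexed doubles of `a` into the lanes selected by
       `lanes`; unselected lanes are taken from `src`.                      */
    static inline __m256d
    dup_even_selected (__m256d src, __m256d a, __mmask8 lanes)
    {
      return _mm256_mask_movedup_pd (src, lanes, a);
    }
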
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A,
+                                                    (__v4si) __B,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A,
+                                                    (__v4si) __B,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A,
+                                                    (__v8si) __B,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A,
+                                                    (__v8si) __B,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A,
+                                                     (__v2di) __B,
+                                                     (__v2di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A,
+                                                     (__v2di) __B,
+                                                     (__v2di)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A,
+                                                     (__v4di) __B,
+                                                     (__v4di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A,
+                                                     (__v4di) __B,
+                                                     (__v4di)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A,
+                                                    (__v4si) __B,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A,
+                                                    (__v4si) __B,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A,
+                                                    (__v8si) __B,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A,
+                                                    (__v8si) __B,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                        __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A,
+                                                     (__v2di) __B,
+                                                     (__v2di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A,
+                                                     (__v2di) __B,
+                                                     (__v2di)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                           __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A,
+                                                     (__v4di) __B,
+                                                     (__v4di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A,
+                                                     (__v4di) __B,
+                                                     (__v4di)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U);
+}
+
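(Editor's illustration, not part of the imported header.) The masked unpacks above interleave within each 128-bit half; a hedged usage sketch, name hypothetical:

    #include <immintrin.h>

    /* Interleave the low 32-bit elements of each 128-bit half of `a` and
       `b`, writing only the lanes selected by `lanes`; the rest come from
       `src`.                                                              */
    static inline __m256i
    interleave_low_selected (__m256i src, __m256i a, __m256i b, __mmask8 lanes)
    {
      return _mm256_mask_unpacklo_epi32 (src, lanes, a, b);
    }
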
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+                                                  (__v4si) __B, 0,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A,
+                                                   (__v4si) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+                                                  (__v4si) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A,
+                                                   (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+                                                  (__v8si) __B, 0,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A,
+                                                   (__v8si) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+                                                  (__v8si) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A,
+                                                   (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epu64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+                                                  (__v2di) __B, 0,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A,
+                                                   (__v2di) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+                                                  (__v2di) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpeq_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A,
+                                                   (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epu64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+                                                  (__v4di) __B, 0,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpeq_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A,
+                                                   (__v4di) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+                                                  (__v4di) __B, 0, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpeq_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A,
+                                                   (__v4di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+                                                  (__v4si) __B, 6,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A,
+                                                   (__v4si) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A,
+                                                  (__v4si) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A,
+                                                   (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+                                                  (__v8si) __B, 6,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A,
+                                                   (__v8si) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A,
+                                                  (__v8si) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A,
+                                                   (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epu64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+                                                  (__v2di) __B, 6,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A,
+                                                   (__v2di) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A,
+                                                  (__v2di) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpgt_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A,
+                                                   (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epu64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+                                                  (__v4di) __B, 6,
+                                                  (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpgt_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A,
+                                                   (__v4di) __B,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A,
+                                                  (__v4di) __B, 6, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpgt_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A,
+                                                   (__v4di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                                              (__v4si) __B,
+                                              (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A,
+                                              (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                                              (__v8si) __B,
+                                              (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A,
+                                              (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_test_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                                              (__v2di) __B,
+                                              (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A,
+                                              (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_test_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                                              (__v4di) __B,
+                                              (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A,
+                                              (__v4di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi32_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+                                               (__v4si) __B,
+                                               (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A,
+                                               (__v4si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi32_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+                                               (__v8si) __B,
+                                               (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A,
+                                               (__v8si) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testn_epi64_mask (__m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+                                               (__v2di) __B,
+                                               (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A,
+                                               (__v2di) __B, __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testn_epi64_mask (__m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+                                               (__v4di) __B,
+                                               (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A,
+                                               (__v4di) __B, __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                                                     (__v4df) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A,
+                                                     (__v4df)
+                                                     _mm256_setzero_pd (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A)
+{
+  __builtin_ia32_compressstoredf256_mask ((__v4df *) __P,
+                                         (__v4df) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                                                     (__v2df) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A,
+                                                     (__v2df)
+                                                     _mm_setzero_pd (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A)
+{
+  __builtin_ia32_compressstoredf128_mask ((__v2df *) __P,
+                                         (__v2df) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                                                    (__v8sf) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A,
+                                                    (__v8sf)
+                                                    _mm256_setzero_ps (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A)
+{
+  __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P,
+                                         (__v8sf) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                                                    (__v4sf) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A,
+                                                    (__v4sf)
+                                                    _mm_setzero_ps (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A)
+{
+  __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P,
+                                         (__v4sf) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                                                     (__v4di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A,
+                                                     (__v4di)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_compressstoredi256_mask ((__v4di *) __P,
+                                         (__v4di) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                                                     (__v2di) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A,
+                                                     (__v2di)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_compressstoredi128_mask ((__v2di *) __P,
+                                         (__v2di) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                                                     (__v8si) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A,
+                                                     (__v8si)
+                                                     _mm256_setzero_si256 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A)
+{
+  __builtin_ia32_compressstoresi256_mask ((__v8si *) __P,
+                                         (__v8si) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                                                     (__v4si) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A,
+                                                     (__v4si)
+                                                     _mm_setzero_si128 (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A)
+{
+  __builtin_ia32_compressstoresi128_mask ((__v4si *) __P,
+                                         (__v4si) __A,
+                                         (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A)
+{
+  return (__m256d) __builtin_ia32_expanddf256_maskz ((__v4df) __A,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P,
+                                                       (__v4df) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m256d) __builtin_ia32_expandloaddf256_maskz ((__v4df *) __P,
+                                                        (__v4df)
+                                                        _mm256_setzero_pd (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_pd (__mmask8 __U, __m128d __A)
+{
+  return (__m128d) __builtin_ia32_expanddf128_maskz ((__v2df) __A,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P,
+                                                       (__v2df) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P)
+{
+  return (__m128d) __builtin_ia32_expandloaddf128_maskz ((__v2df *) __P,
+                                                        (__v2df)
+                                                        _mm_setzero_pd (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A)
+{
+  return (__m256) __builtin_ia32_expandsf256_maskz ((__v8sf) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P,
+                                                      (__v8sf) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m256) __builtin_ia32_expandloadsf256_maskz ((__v8sf *) __P,
+                                                       (__v8sf)
+                                                       _mm256_setzero_ps (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_ps (__mmask8 __U, __m128 __A)
+{
+  return (__m128) __builtin_ia32_expandsf128_maskz ((__v4sf) __A,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P,
+                                                      (__v4sf) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P)
+{
+  return (__m128) __builtin_ia32_expandloadsf128_maskz ((__v4sf *) __P,
+                                                       (__v4sf)
+                                                       _mm_setzero_ps (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_expanddi256_maskz ((__v4di) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U,
+                              void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P,
+                                                       (__v4di) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloaddi256_maskz ((__v4di *) __P,
+                                                        (__v4di)
+                                                        _mm256_setzero_si256 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_expanddi128_maskz ((__v2di) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P,
+                                                       (__v2di) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloaddi128_maskz ((__v2di *) __P,
+                                                        (__v2di)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_expandsi256_maskz ((__v8si) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U,
+                              void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P,
+                                                       (__v8si) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m256i) __builtin_ia32_expandloadsi256_maskz ((__v8si *) __P,
+                                                        (__v8si)
+                                                        _mm256_setzero_si256 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_expandsi128_maskz ((__v4si) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P,
+                                                       (__v4si) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P)
+{
+  return (__m128i) __builtin_ia32_expandloadsi128_maskz ((__v4si *) __P,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+                                                       /* idx */ ,
+                                                       (__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I,
+                            __m256d __B)
+{
+  return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I
+                                                       /* idx */ ,
+                                                       (__v4df) __A,
+                                                       (__v4df) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U,
+                             __m256d __B)
+{
+  return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A,
+                                                       (__v4di) __I
+                                                       /* idx */ ,
+                                                       (__v4df) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I,
+                             __m256d __B)
+{
+  return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I
+                                                        /* idx */ ,
+                                                        (__v4df) __A,
+                                                        (__v4df) __B,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B)
+{
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                                                      /* idx */ ,
+                                                      (__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I,
+                            __m256 __B)
+{
+  return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I
+                                                      /* idx */ ,
+                                                      (__v8sf) __A,
+                                                      (__v8sf) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U,
+                             __m256 __B)
+{
+  return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A,
+                                                      (__v8si) __I
+                                                      /* idx */ ,
+                                                      (__v8sf) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I,
+                             __m256 __B)
+{
+  return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I
+                                                       /* idx */ ,
+                                                       (__v8sf) __A,
+                                                       (__v8sf) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                                                      /* idx */ ,
+                                                      (__v2di) __A,
+                                                      (__v2di) __B,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I,
+                            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I
+                                                      /* idx */ ,
+                                                      (__v2di) __A,
+                                                      (__v2di) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A,
+                                                      (__v2di) __I
+                                                      /* idx */ ,
+                                                      (__v2di) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I
+                                                       /* idx */ ,
+                                                       (__v2di) __A,
+                                                       (__v2di) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                                                      /* idx */ ,
+                                                      (__v4si) __A,
+                                                      (__v4si) __B,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I,
+                            __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I
+                                                      /* idx */ ,
+                                                      (__v4si) __A,
+                                                      (__v4si) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A,
+                                                      (__v4si) __I
+                                                      /* idx */ ,
+                                                      (__v4si) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I,
+                             __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I
+                                                       /* idx */ ,
+                                                       (__v4si) __A,
+                                                       (__v4si) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                                                      /* idx */ ,
+                                                      (__v4di) __A,
+                                                      (__v4di) __B,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I,
+                               __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I
+                                                      /* idx */ ,
+                                                      (__v4di) __A,
+                                                      (__v4di) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I,
+                                __mmask8 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A,
+                                                      (__v4di) __I
+                                                      /* idx */ ,
+                                                      (__v4di) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A,
+                                __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I
+                                                       /* idx */ ,
+                                                       (__v4di) __A,
+                                                       (__v4di) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                                                      /* idx */ ,
+                                                      (__v8si) __A,
+                                                      (__v8si) __B,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I,
+                               __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I
+                                                      /* idx */ ,
+                                                      (__v8si) __A,
+                                                      (__v8si) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I,
+                                __mmask8 __U, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A,
+                                                      (__v8si) __I
+                                                      /* idx */ ,
+                                                      (__v8si) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A,
+                                __m256i __I, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I
+                                                       /* idx */ ,
+                                                       (__v8si) __A,
+                                                       (__v8si) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+                                                       /* idx */ ,
+                                                       (__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I,
+                         __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I
+                                                       /* idx */ ,
+                                                       (__v2df) __A,
+                                                       (__v2df) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U,
+                          __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A,
+                                                       (__v2di) __I
+                                                       /* idx */ ,
+                                                       (__v2df) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I,
+                          __m128d __B)
+{
+  return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I
+                                                        /* idx */ ,
+                                                        (__v2df) __A,
+                                                        (__v2df) __B,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B)
+{
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                                                      /* idx */ ,
+                                                      (__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I,
+                         __m128 __B)
+{
+  return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I
+                                                      /* idx */ ,
+                                                      (__v4sf) __A,
+                                                      (__v4sf) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U,
+                          __m128 __B)
+{
+  return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A,
+                                                      (__v4si) __I
+                                                      /* idx */ ,
+                                                      (__v4sf) __B,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I,
+                          __m128 __B)
+{
+  return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I
+                                                       /* idx */ ,
+                                                       (__v4sf) __A,
+                                                       (__v4sf) __B,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srav_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+                    __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+                       __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
+                                                (__v8si) __Y,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X,
+                                                (__v8si) __Y,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+                    __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
+                                                (__v4si) __Y,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X,
+                                                (__v4si) __Y,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+                       __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
+                                                (__v4di) __Y,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X,
+                                                (__v4di) __Y,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+                    __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
+                                                (__v2di) __Y,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X,
+                                                (__v2di) __Y,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+                       __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
+                                                (__v8si) __Y,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X,
+                                                (__v8si) __Y,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+                    __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
+                                                (__v4si) __Y,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X,
+                                                (__v4si) __Y,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X,
+                       __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
+                                                (__v8si) __Y,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X,
+                                                (__v8si) __Y,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X,
+                    __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
+                                                (__v4si) __Y,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X,
+                                                (__v4si) __Y,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+                       __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
+                                                (__v4di) __Y,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X,
+                                                (__v4di) __Y,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X,
+                    __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
+                                                (__v2di) __Y,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X,
+                                                (__v2di) __Y,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rolv_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rolv_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rorv_epi32 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rorv_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rolv_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rolv_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rorv_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rorv_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                    __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srav_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srav_epi64 (__m256i __W, __mmask8 __U, __m256i __X,
+                       __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di) __W, __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_pd (),
+                                                __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di) __W, __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_pd (),
+                                                __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                         __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W, __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_pd (),
+                                                 __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                      __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W, __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_pd (),
+                                                 __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                     __m256i __B)
+{
+  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+                                               (__v4di) __B,
+                                               (__v4di) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A,
+                                               (__v4di) __B,
+                                               (__v4di)
+                                               _mm256_setzero_si256 (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A | (__v4du)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+                                               (__v2di) __B,
+                                               (__v2di) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A,
+                                               (__v2di) __B,
+                                               (__v2di)
+                                               _mm_setzero_si128 (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A | (__v2du)__B);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A,
+                                                (__v4di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) ((__v4du)__A ^ (__v4du)__B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A ^ (__v2du)__B);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf) __W,
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf)
+                                            _mm_setzero_ps (),
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __W,
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df)
+                                             _mm_setzero_pd (),
+                                             (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf) __W,
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf) __W,
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf)
+                                            _mm_setzero_ps (),
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf)
+                                            _mm_setzero_ps (),
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf) __W,
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A,
+                                            (__v4sf) __B,
+                                            (__v4sf)
+                                            _mm_setzero_ps (),
+                                            (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __W,
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df)
+                                             _mm_setzero_pd (),
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __W,
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df)
+                                             _mm_setzero_pd (),
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df) __W,
+                                             (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A,
+                                             (__v2df) __B,
+                                             (__v2df)
+                                             _mm_setzero_pd (),
+                                             (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf) __W,
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A,
+                                               (__v8sf) __B,
+                                               (__v8sf)
+                                               _mm256_setzero_ps (),
+                                               (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                   __m256d __B)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A,
+                                                (__v4df) __B,
+                                                (__v4df)
+                                                _mm256_setzero_pd (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epi64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_epu64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_epu64 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A,
+                                                 (__v4di) __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A,
+                      __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A,
+                                                 (__v2di) __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W, __M);
+}
+
+#ifndef __AVX512CD__
+#pragma GCC push_options
+#pragma GCC target("avx512vl,avx512cd")
+#define __DISABLE_AVX512VLCD__
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmb128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastmb_epi64 (__mmask8 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmb256 (__A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m128i) __builtin_ia32_broadcastmw128 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcastmw_epi32 (__mmask16 __A)
+{
+  return (__m256i) __builtin_ia32_broadcastmw256 (__A);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lzcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                                                    (__v8si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lzcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                                                    (__v4di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conflict_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+                                                        (__v4di)
+                                                        _mm256_setzero_si256 (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+                                                        (__v4di) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A,
+                                                        (__v4di)
+                                                        _mm256_setzero_si256 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_conflict_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+                                                        (__v8si)
+                                                        _mm256_setzero_si256 (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+                                                        (__v8si) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A,
+                                                        (__v8si)
+                                                        _mm256_setzero_si256 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lzcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                                                    (__v4si) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A,
+                                                    (__v4si)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lzcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                                                    (__v2di) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A,
+                                                    (__v2di)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conflict_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+                                                        (__v2di)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+                                                        (__v2di) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A,
+                                                        (__v2di)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_conflict_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+                                                        (__v4si) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+#ifdef __DISABLE_AVX512VLCD__
+#pragma GCC pop_options
+#endif
+
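/* [Editor's note, not part of the committed header] The block above is
   compiled under the "avx512vl,avx512cd" target pragma and exposes per-lane
   leading-zero counts (VPLZCNTD/Q), conflict detection (VPCONFLICTD/Q) and
   mask broadcasts on 128/256-bit vectors.  A sketch, assuming <immintrin.h>
   and -mavx512vl -mavx512cd: */
#include <immintrin.h>

static inline __m256i
lzcnt_of_nonzero_lanes (__m256i v)
{
  /* Zero-valued lanes yield 0 here instead of their leading-zero count 32.  */
  __mmask8 nz = _mm256_cmpneq_epi32_mask (v, _mm256_setzero_si256 ());
  return _mm256_maskz_lzcnt_epi32 (nz, v);
}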
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                        __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                     __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                        __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                        __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A,
+                                                   (__v4df) __B,
+                                                   (__v4df)
+                                                   _mm256_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                     __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A,
+                                                   (__v2df) __B,
+                                                   (__v2df)
+                                                   _mm_setzero_pd (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                        __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A,
+                                                  (__v8sf) __B,
+                                                  (__v8sf)
+                                                  _mm256_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf) __W,
+                                                  (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A,
+                                                  (__v4sf) __B,
+                                                  (__v4sf)
+                                                  _mm_setzero_ps (),
+                                                  (__mmask8) __U);
+}
+
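/* [Editor's note, not part of the committed header] The masked unpack and
   vcvtph2ps wrappers above mirror their plain AVX counterparts, adding the
   write-mask (__W kept) or zero-mask semantics.  An illustrative sketch,
   assuming -mavx512vl: */
#include <immintrin.h>

static inline __m256d
interleave_lo_or_keep (__m256d dst, __m256d a, __m256d b, __mmask8 lanes)
{
  /* Selected lanes receive UNPCKLPD (a, b); the rest keep dst.  */
  return _mm256_mask_unpacklo_pd (dst, lanes, a, b);
}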
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+                                                (__v4si) __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A,
+                                                (__v4si) __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sra_epi64 (__m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+                                                (__v2di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+                                                (__v2di) __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A,
+                                                (__v2di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A,
+                                                (__v2di) __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+                                                (__v4si) __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A,
+                                                (__v4si) __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+                                                (__v2di) __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B)
+{
+  return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A,
+                                                (__v2di) __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
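/* [Editor's note, not part of the committed header] As with the legacy
   PSLLD/PSRAD/PSRAQ forms, the masked shift wrappers above take the shift
   count from the low 64 bits of a separate __m128i operand.  Sketch,
   assuming -mavx512vl: */
#include <immintrin.h>

static inline __m256i
sra_selected_lanes (__m256i v, int count, __mmask8 lanes)
{
  __m128i c = _mm_cvtsi32_si128 (count);	/* count in the low qword */
  /* Unselected lanes pass through unchanged from v.  */
  return _mm256_mask_sra_epi32 (v, lanes, v, c);
}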
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X,
+                           __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                                                   (__v8si) __X,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y,
+                                                   (__v8si) __X,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_pd (__m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                                                    (__v4di) __X,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X,
+                           __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                                                    (__v4di) __X,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y,
+                                                    (__v4di) __X,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                          __m256i __C)
+{
+  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+                                                       (__v4di) __C,
+                                                       (__v4df) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C)
+{
+  return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A,
+                                                       (__v4di) __C,
+                                                       (__v4df)
+                                                       _mm256_setzero_pd (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                          __m256i __C)
+{
+  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+                                                      (__v8si) __C,
+                                                      (__v8sf) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C)
+{
+  return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A,
+                                                      (__v8si) __C,
+                                                      (__v8sf)
+                                                      _mm256_setzero_ps (),
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                       __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+                                                    (__v2di) __C,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A,
+                                                    (__v2di) __C,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A,
+                       __m128i __C)
+{
+  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+                                                   (__v4si) __C,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C)
+{
+  return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A,
+                                                   (__v4si) __C,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
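/* [Editor's note, not part of the committed header] The permutexvar and
   permutevar wrappers above index lanes of the data operand by the
   corresponding elements of the index vector.  For example, reversing the
   eight floats of a vector while zeroing unselected result lanes (sketch,
   assuming -mavx512vl): */
#include <immintrin.h>

static inline __m256
reverse_selected_lanes (__m256 v, __mmask8 lanes)
{
  const __m256i idx = _mm256_setr_epi32 (7, 6, 5, 4, 3, 2, 1, 0);
  return _mm256_maskz_permutexvar_ps (lanes, idx, v);
}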
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                                                    (__v4di) __X,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A,
+                        __m256i __B)
+{
+  return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A,
+                     __m128i __B)
+{
+  return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+                      __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+                                                 (__v8si) __Y,
+                                                 (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X,
+                                                 (__v8si) __Y,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X,
+                   __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+                                                 (__v4si) __Y,
+                                                 (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X,
+                                                 (__v4si) __Y,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                                                    (__v4di) __X,
+                                                    (__v4di)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X,
+                              __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y,
+                                                    (__v4di) __X,
+                                                    (__v4di) __W,
+                                                    __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X,
+                      __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+                                                  (__v8si) __Y,
+                                                  (__v4di) __W, __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                                                    (__v8si) __X,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X,
+                                                  (__v8si) __Y,
+                                                  (__v4di)
+                                                  _mm256_setzero_si256 (),
+                                                  __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X,
+                   __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+                                                  (__v4si) __Y,
+                                                  (__v2di) __W, __M);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X,
+                                                  (__v4si) __Y,
+                                                  (__v2di)
+                                                  _mm_setzero_si128 (),
+                                                  __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                                                    (__v8si) __X,
+                                                    (__v8si)
+                                                    _mm256_setzero_si256 (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X,
+                              __m256i __Y)
+{
+  return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y,
+                                                    (__v8si) __X,
+                                                    (__v8si) __W,
+                                                    __M);
+}
+
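/* [Editor's note, not part of the committed header] Like PMULUDQ/PMULDQ, the
   mul_epu32/mul_epi32 wrappers above multiply the even-numbered 32-bit lanes
   into 64-bit products; the mask selects 64-bit result lanes.  Sketch,
   assuming -mavx512vl: */
#include <immintrin.h>

static inline __m256i
widening_mul_even_lanes (__m256i x, __m256i y, __mmask8 lanes)
{
  /* Unselected 64-bit product lanes are zeroed.  */
  return _mm256_maskz_mul_epu32 (lanes, x, y);
}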
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 4,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 4,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 1,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 1,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 5,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 5,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 2,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, 2,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 4,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epu64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 4,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 1,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 1,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 5,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 5,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 2,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, 2,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 4,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 4,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 1,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 1,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 5,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 5,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 2,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, 2,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 4,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 4,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 1,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 1,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 5,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 5,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 2,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, 2,
+                                                (__mmask8) -1);
+}
+
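/* [Editor's note, not part of the committed header] The compare wrappers in
   this group all route through __builtin_ia32_[u]cmp{d,q}*_mask with an
   immediate predicate (1 = LT, 2 = LE, 4 = NEQ, 5 = GE).  The resulting
   __mmask8 is typically fed straight back into a masked operation, e.g.
   (sketch, assuming -mavx512vl): */
#include <immintrin.h>

static inline __m256i
keep_below_threshold (__m256i v, __m256i threshold)
{
  __mmask8 below = _mm256_cmplt_epu32_mask (v, threshold);
  /* Lanes greater than or equal to the threshold are zeroed.  */
  return _mm256_maskz_mov_epi32 (below, v);
}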
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 4,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 4,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 1,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 1,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 5,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 5,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 2,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, 2,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 4,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 4,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 1,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 1,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 5,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 5,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 2,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epu64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, 2,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 4,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 4,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 1,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 1,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 5,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 5,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 2,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi32_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, 2,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 4,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 4,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 1,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 1,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 5,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 5,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 2,
+                                                (__mmask8) __M);
+}
+
+extern __inline __mmask8
+  __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_epi64_mask (__m128i __X, __m128i __Y)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, 2,
+                                                (__mmask8) -1);
+}
+
+#ifdef __OPTIMIZE__
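+/* The intrinsics below take a compile-time immediate operand, so the inline
+   function forms are only provided when optimizing; the header's matching
+   #else branch normally supplies macro equivalents instead.  */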
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex_epi64 (__m256i __X, const int __I)
+{
+  return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+                                             __I,
+                                             (__v4di)
+                                             _mm256_setzero_si256(),
+                                             (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex_epi64 (__m256i __W, __mmask8 __M,
+                           __m256i __X, const int __I)
+{
+  return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+                                                 __I,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex_epi64 (__mmask8 __M, __m256i __X, const int __I)
+{
+  return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X,
+                                                 __I,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __M);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                       __m256d __B, const int __imm)
+{
+  return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
+                                                 (__v4df) __B, __imm,
+                                                 (__v4df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                        const int __imm)
+{
+  return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A,
+                                                 (__v4df) __B, __imm,
+                                                 (__v4df)
+                                                 _mm256_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                    __m128d __B, const int __imm)
+{
+  return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
+                                                 (__v2df) __B, __imm,
+                                                 (__v2df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                     const int __imm)
+{
+  return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A,
+                                                 (__v2df) __B, __imm,
+                                                 (__v2df)
+                                                 _mm_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                       __m256 __B, const int __imm)
+{
+  return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
+                                                (__v8sf) __B, __imm,
+                                                (__v8sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                        const int __imm)
+{
+  return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A,
+                                                (__v8sf) __B, __imm,
+                                                (__v8sf)
+                                                _mm256_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B,
+                    const int __imm)
+{
+  return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
+                                                (__v4sf) __B, __imm,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_ps (__mmask8 __U, __m128 __A, __m128 __B,
+                     const int __imm)
+{
+  return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A,
+                                                (__v4sf) __B, __imm,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
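+/* Insert/extract a 128-bit lane: __imm selects which half of the 256-bit
+   vector is replaced by (or copied out to) the 128-bit operand.  */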
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+                                                       (__v4si) __B,
+                                                       __imm,
+                                                       (__v8si)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_inserti32x4 (__m256i __W, __mmask8 __U, __m256i __A,
+                        __m128i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+                                                       (__v4si) __B,
+                                                       __imm,
+                                                       (__v8si) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_inserti32x4 (__mmask8 __U, __m256i __A, __m128i __B,
+                         const int __imm)
+{
+  return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A,
+                                                       (__v4si) __B,
+                                                       __imm,
+                                                       (__v8si)
+                                                       _mm256_setzero_si256 (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf32x4 (__m256 __A, __m128 __B, const int __imm)
+{
+  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+                                                      (__v4sf) __B,
+                                                      __imm,
+                                                      (__v8sf)
+                                                      _mm256_setzero_ps (),
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_insertf32x4 (__m256 __W, __mmask8 __U, __m256 __A,
+                        __m128 __B, const int __imm)
+{
+  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+                                                      (__v4sf) __B,
+                                                      __imm,
+                                                      (__v8sf) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_insertf32x4 (__mmask8 __U, __m256 __A, __m128 __B,
+                         const int __imm)
+{
+  return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A,
+                                                      (__v4sf) __B,
+                                                      __imm,
+                                                      (__v8sf)
+                                                      _mm256_setzero_ps (),
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extracti32x4_epi32 (__m256i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+                                                        __imm,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m256i __A,
+                               const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+                                                        __imm,
+                                                        (__v4si) __W,
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extracti32x4_epi32 (__mmask8 __U, __m256i __A,
+                                const int __imm)
+{
+  return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A,
+                                                        __imm,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask8)
+                                                        __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf32x4_ps (__m256 __A, const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+                                                       __imm,
+                                                       (__v4sf)
+                                                       _mm_setzero_ps (),
+                                                       (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m256 __A,
+                            const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+                                                       __imm,
+                                                       (__v4sf) __W,
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_extractf32x4_ps (__mmask8 __U, __m256 __A,
+                             const int __imm)
+{
+  return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A,
+                                                       __imm,
+                                                       (__v4sf)
+                                                       _mm_setzero_ps (),
+                                                       (__mmask8)
+                                                       __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_i64x2 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+                                                      (__v4di) __B,
+                                                      __imm,
+                                                      (__v4di)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_i64x2 (__m256i __W, __mmask8 __U, __m256i __A,
+                          __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+                                                      (__v4di) __B,
+                                                      __imm,
+                                                      (__v4di) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_i64x2 (__mmask8 __U, __m256i __A, __m256i __B,
+                           const int __imm)
+{
+  return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A,
+                                                      (__v4di) __B,
+                                                      __imm,
+                                                      (__v4di)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_i32x4 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+                                                      (__v8si) __B,
+                                                      __imm,
+                                                      (__v8si)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_i32x4 (__m256i __W, __mmask8 __U, __m256i __A,
+                          __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+                                                      (__v8si) __B,
+                                                      __imm,
+                                                      (__v8si) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_i32x4 (__mmask8 __U, __m256i __A, __m256i __B,
+                           const int __imm)
+{
+  return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A,
+                                                      (__v8si) __B,
+                                                      __imm,
+                                                      (__v8si)
+                                                      _mm256_setzero_si256 (),
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_f64x2 (__m256d __A, __m256d __B, const int __imm)
+{
+  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_f64x2 (__m256d __W, __mmask8 __U, __m256d __A,
+                          __m256d __B, const int __imm)
+{
+  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      __imm,
+                                                      (__v4df) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_f64x2 (__mmask8 __U, __m256d __A, __m256d __B,
+                           const int __imm)
+{
+  return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_f32x4 (__m256 __A, __m256 __B, const int __imm)
+{
+  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_f32x4 (__m256 __W, __mmask8 __U, __m256 __A,
+                          __m256 __B, const int __imm)
+{
+  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     __imm,
+                                                     (__v8sf) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_f32x4 (__mmask8 __U, __m256 __A, __m256 __B,
+                           const int __imm)
+{
+  return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) __U);
+}
+
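+/* VFIXUPIMM: fix up special-case values (NaN, infinity, zero, ...) of the
+   sources according to the per-element table in __C and the flags in __imm.  */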
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fixupimm_pd (__m256d __A, __m256d __B, __m256i __C,
+                   const int __imm)
+{
+  return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4di) __C,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fixupimm_pd (__m256d __A, __mmask8 __U, __m256d __B,
+                        __m256i __C, const int __imm)
+{
+  return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A,
+                                                     (__v4df) __B,
+                                                     (__v4di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fixupimm_pd (__mmask8 __U, __m256d __A, __m256d __B,
+                         __m256i __C, const int __imm)
+{
+  return (__m256d) __builtin_ia32_fixupimmpd256_maskz ((__v4df) __A,
+                                                      (__v4df) __B,
+                                                      (__v4di) __C,
+                                                      __imm,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fixupimm_ps (__m256 __A, __m256 __B, __m256i __C,
+                   const int __imm)
+{
+  return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8si) __C,
+                                                    __imm,
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_fixupimm_ps (__m256 __A, __mmask8 __U, __m256 __B,
+                        __m256i __C, const int __imm)
+{
+  return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A,
+                                                    (__v8sf) __B,
+                                                    (__v8si) __C,
+                                                    __imm,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_fixupimm_ps (__mmask8 __U, __m256 __A, __m256 __B,
+                         __m256i __C, const int __imm)
+{
+  return (__m256) __builtin_ia32_fixupimmps256_maskz ((__v8sf) __A,
+                                                     (__v8sf) __B,
+                                                     (__v8si) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_pd (__m128d __A, __m128d __B, __m128i __C,
+                const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2di) __C,
+                                                     __imm,
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_pd (__m128d __A, __mmask8 __U, __m128d __B,
+                     __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A,
+                                                     (__v2df) __B,
+                                                     (__v2di) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_pd (__mmask8 __U, __m128d __A, __m128d __B,
+                      __m128i __C, const int __imm)
+{
+  return (__m128d) __builtin_ia32_fixupimmpd128_maskz ((__v2df) __A,
+                                                      (__v2df) __B,
+                                                      (__v2di) __C,
+                                                      __imm,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fixupimm_ps (__m128 __A, __m128 __B, __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4si) __C,
+                                                    __imm,
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fixupimm_ps (__m128 __A, __mmask8 __U, __m128 __B,
+                     __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A,
+                                                    (__v4sf) __B,
+                                                    (__v4si) __C,
+                                                    __imm,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B,
+                      __m128i __C, const int __imm)
+{
+  return (__m128) __builtin_ia32_fixupimmps128_maskz ((__v4sf) __A,
+                                                     (__v4sf) __B,
+                                                     (__v4si) __C,
+                                                     __imm,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
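+/* VPTERNLOG: the 8-bit immediate is a truth table applied bitwise; for each
+   bit position the bits of __A, __B and __C form a 3-bit index selecting a
+   bit of __imm.  */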
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C,
+                          const int __imm)
+{
+  return (__m256i)
+    __builtin_ia32_pternlogq256_mask ((__v4di) __A,
+                                     (__v4di) __B,
+                                     (__v4di) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U,
+                               __m256i __B, __m256i __C,
+                               const int __imm)
+{
+  return (__m256i)
+    __builtin_ia32_pternlogq256_mask ((__v4di) __A,
+                                     (__v4di) __B,
+                                     (__v4di) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A,
+                                __m256i __B, __m256i __C,
+                                const int __imm)
+{
+  return (__m256i)
+    __builtin_ia32_pternlogq256_maskz ((__v4di) __A,
+                                      (__v4di) __B,
+                                      (__v4di) __C,
+                                      (unsigned char) __imm,
+                                      (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C,
+                          const int __imm)
+{
+  return (__m256i)
+    __builtin_ia32_pternlogd256_mask ((__v8si) __A,
+                                     (__v8si) __B,
+                                     (__v8si) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U,
+                               __m256i __B, __m256i __C,
+                               const int __imm)
+{
+  return (__m256i)
+    __builtin_ia32_pternlogd256_mask ((__v8si) __A,
+                                     (__v8si) __B,
+                                     (__v8si) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A,
+                                __m256i __B, __m256i __C,
+                                const int __imm)
+{
+  return (__m256i)
+    __builtin_ia32_pternlogd256_maskz ((__v8si) __A,
+                                      (__v8si) __B,
+                                      (__v8si) __C,
+                                      (unsigned char) __imm,
+                                      (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C,
+                       const int __imm)
+{
+  return (__m128i)
+    __builtin_ia32_pternlogq128_mask ((__v2di) __A,
+                                     (__v2di) __B,
+                                     (__v2di) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U,
+                            __m128i __B, __m128i __C,
+                            const int __imm)
+{
+  return (__m128i)
+    __builtin_ia32_pternlogq128_mask ((__v2di) __A,
+                                     (__v2di) __B,
+                                     (__v2di) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A,
+                             __m128i __B, __m128i __C,
+                             const int __imm)
+{
+  return (__m128i)
+    __builtin_ia32_pternlogq128_maskz ((__v2di) __A,
+                                      (__v2di) __B,
+                                      (__v2di) __C,
+                                      (unsigned char) __imm,
+                                      (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C,
+                       const int __imm)
+{
+  return (__m128i)
+    __builtin_ia32_pternlogd128_mask ((__v4si) __A,
+                                     (__v4si) __B,
+                                     (__v4si) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U,
+                            __m128i __B, __m128i __C,
+                            const int __imm)
+{
+  return (__m128i)
+    __builtin_ia32_pternlogd128_mask ((__v4si) __A,
+                                     (__v4si) __B,
+                                     (__v4si) __C,
+                                     (unsigned char) __imm,
+                                     (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A,
+                             __m128i __B, __m128i __C,
+                             const int __imm)
+{
+  return (__m128i)
+    __builtin_ia32_pternlogd128_maskz ((__v4si) __A,
+                                      (__v4si) __B,
+                                      (__v4si) __C,
+                                      (unsigned char) __imm,
+                                      (__mmask8) __U);
+}
+
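+/* VRNDSCALE: round each element to a given number of fraction bits; the
+   upper four bits of __imm give that count, the low bits the rounding
+   behaviour.  */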
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_roundscale_ps (__m256 __A, const int __imm)
+{
+  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_roundscale_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                          const int __imm)
+{
+  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+                                                     __imm,
+                                                     (__v8sf) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_roundscale_ps (__mmask8 __U, __m256 __A, const int __imm)
+{
+  return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A,
+                                                     __imm,
+                                                     (__v8sf)
+                                                     _mm256_setzero_ps (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_roundscale_pd (__m256d __A, const int __imm)
+{
+  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_roundscale_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                          const int __imm)
+{
+  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+                                                      __imm,
+                                                      (__v4df) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_roundscale_pd (__mmask8 __U, __m256d __A, const int __imm)
+{
+  return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A,
+                                                      __imm,
+                                                      (__v4df)
+                                                      _mm256_setzero_pd (),
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_ps (__m128 __A, const int __imm)
+{
+  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+                                                     __imm,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_ps (__m128 __W, __mmask8 __U, __m128 __A,
+                       const int __imm)
+{
+  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+                                                     __imm,
+                                                     (__v4sf) __W,
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_ps (__mmask8 __U, __m128 __A, const int __imm)
+{
+  return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A,
+                                                     __imm,
+                                                     (__v4sf)
+                                                     _mm_setzero_ps (),
+                                                     (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roundscale_pd (__m128d __A, const int __imm)
+{
+  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+                                                      __imm,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_roundscale_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                       const int __imm)
+{
+  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+                                                      __imm,
+                                                      (__v2df) __W,
+                                                      (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_roundscale_pd (__mmask8 __U, __m128d __A, const int __imm)
+{
+  return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A,
+                                                      __imm,
+                                                      (__v2df)
+                                                      _mm_setzero_pd (),
+                                                      (__mmask8) __U);
+}
+
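+/* VGETMANT: extract the normalized mantissa of each element; the interval
+   selector __B and the sign control __C are packed into the immediate as
+   (__C << 2) | __B.  */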
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_ps (__m256 __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_ps (__m256 __W, __mmask8 __U, __m256 __A,
+                       _MM_MANTISSA_NORM_ENUM __B,
+                       _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_ps (__mmask8 __U, __m256 __A,
+                        _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_ps (__m128 __A, _MM_MANTISSA_NORM_ENUM __B,
+               _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) -1);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_ps (__m128 __W, __mmask8 __U, __m128 __A,
+                    _MM_MANTISSA_NORM_ENUM __B,
+                    _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_ps (__mmask8 __U, __m128 __A,
+                     _MM_MANTISSA_NORM_ENUM __B,
+                     _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A,
+                                                   (__C << 2) | __B,
+                                                   (__v4sf)
+                                                   _mm_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_getmant_pd (__m256d __A, _MM_MANTISSA_NORM_ENUM __B,
+                  _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_getmant_pd (__m256d __W, __mmask8 __U, __m256d __A,
+                       _MM_MANTISSA_NORM_ENUM __B,
+                       _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_getmant_pd (__mmask8 __U, __m256d __A,
+                        _MM_MANTISSA_NORM_ENUM __B,
+                        _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getmant_pd (__m128d __A, _MM_MANTISSA_NORM_ENUM __B,
+               _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) -1);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_getmant_pd (__m128d __W, __mmask8 __U, __m128d __A,
+                    _MM_MANTISSA_NORM_ENUM __B,
+                    _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_getmant_pd (__mmask8 __U, __m128d __A,
+                     _MM_MANTISSA_NORM_ENUM __B,
+                     _MM_MANTISSA_SIGN_ENUM __C)
+{
+  return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A,
+                                                    (__C << 2) | __B,
+                                                    (__v2df)
+                                                    _mm_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_ps (__m256 __v1_old, __mmask8 __mask,
+                          __m256i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m256) __builtin_ia32_gather3siv8sf ((__v8sf) __v1_old,
+                                               __addr,
+                                               (__v8si) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_ps (__m128 __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
+{
+  return (__m128) __builtin_ia32_gather3siv4sf ((__v4sf) __v1_old,
+                                               __addr,
+                                               (__v4si) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_pd (__m256d __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m256d) __builtin_ia32_gather3siv4df ((__v4df) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_pd (__m128d __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
+{
+  return (__m128d) __builtin_ia32_gather3siv2df ((__v2df) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
+                          __m256i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m128) __builtin_ia32_gather3div8sf ((__v4sf) __v1_old,
+                                               __addr,
+                                               (__v4di) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
+{
+  return (__m128) __builtin_ia32_gather3div4sf ((__v4sf) __v1_old,
+                                               __addr,
+                                               (__v2di) __index,
+                                               __mask, __scale);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_pd (__m256d __v1_old, __mmask8 __mask,
+                          __m256i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m256d) __builtin_ia32_gather3div4df ((__v4df) __v1_old,
+                                                __addr,
+                                                (__v4di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_pd (__m128d __v1_old, __mmask8 __mask,
+                       __m128i __index, void const *__addr,
+                       int __scale)
+{
+  return (__m128d) __builtin_ia32_gather3div2df ((__v2df) __v1_old,
+                                                __addr,
+                                                (__v2di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_epi32 (__m256i __v1_old, __mmask8 __mask,
+                             __m256i __index, void const *__addr,
+                             int __scale)
+{
+  return (__m256i) __builtin_ia32_gather3siv8si ((__v8si) __v1_old,
+                                                __addr,
+                                                (__v8si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m128i) __builtin_ia32_gather3siv4si ((__v4si) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i32gather_epi64 (__m256i __v1_old, __mmask8 __mask,
+                             __m128i __index, void const *__addr,
+                             int __scale)
+{
+  return (__m256i) __builtin_ia32_gather3siv4di ((__v4di) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i32gather_epi64 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m128i) __builtin_ia32_gather3siv2di ((__v2di) __v1_old,
+                                                __addr,
+                                                (__v4si) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+                             __m256i __index, void const *__addr,
+                             int __scale)
+{
+  return (__m128i) __builtin_ia32_gather3div8si ((__v4si) __v1_old,
+                                                __addr,
+                                                (__v4di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m128i) __builtin_ia32_gather3div4si ((__v4si) __v1_old,
+                                                __addr,
+                                                (__v2di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mmask_i64gather_epi64 (__m256i __v1_old, __mmask8 __mask,
+                             __m256i __index, void const *__addr,
+                             int __scale)
+{
+  return (__m256i) __builtin_ia32_gather3div4di ((__v4di) __v1_old,
+                                                __addr,
+                                                (__v4di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mmask_i64gather_epi64 (__m128i __v1_old, __mmask8 __mask,
+                          __m128i __index, void const *__addr,
+                          int __scale)
+{
+  return (__m128i) __builtin_ia32_gather3div2di ((__v2di) __v1_old,
+                                                __addr,
+                                                (__v2di) __index,
+                                                __mask, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_ps (void *__addr, __m256i __index,
+                     __m256 __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF,
+                               (__v8si) __index, (__v8sf) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
+                          __m256i __index, __m256 __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index,
+                               (__v8sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_ps (void *__addr, __m128i __index, __m128 __v1,
+                  const int __scale)
+{
+  __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4sf) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_ps (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128 __v1,
+                       const int __scale)
+{
+  __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index,
+                               (__v4sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_pd (void *__addr, __m128i __index,
+                     __m256d __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4df) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m256d __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index,
+                               (__v4df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_pd (void *__addr, __m128i __index,
+                  __m128d __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v2df) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_pd (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128d __v1,
+                       const int __scale)
+{
+  __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index,
+                               (__v2df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_ps (void *__addr, __m256i __index,
+                     __m128 __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4sf) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+                          __m256i __index, __m128 __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index,
+                               (__v4sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_ps (void *__addr, __m128i __index, __m128 __v1,
+                  const int __scale)
+{
+  __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v4sf) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_ps (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128 __v1,
+                       const int __scale)
+{
+  __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index,
+                               (__v4sf) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_pd (void *__addr, __m256i __index,
+                     __m256d __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4df) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+                          __m256i __index, __m256d __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index,
+                               (__v4df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_pd (void *__addr, __m128i __index,
+                  __m128d __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv2df (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v2df) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_pd (void *__addr, __mmask8 __mask,
+                       __m128i __index, __m128d __v1,
+                       const int __scale)
+{
+  __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index,
+                               (__v2df) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_epi32 (void *__addr, __m256i __index,
+                        __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF,
+                               (__v8si) __index, (__v8si) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
+                             __m256i __index, __m256i __v1,
+                             const int __scale)
+{
+  __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index,
+                               (__v8si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_epi32 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4si) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index,
+                               (__v4si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i32scatter_epi64 (void *__addr, __m128i __index,
+                        __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v4di) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+                             __m128i __index, __m256i __v1,
+                             const int __scale)
+{
+  __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index,
+                               (__v4di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i32scatter_epi64 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF,
+                               (__v4si) __index, (__v2di) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index,
+                               (__v2di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_epi32 (void *__addr, __m256i __index,
+                        __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4si) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+                             __m256i __index, __m128i __v1,
+                             const int __scale)
+{
+  __builtin_ia32_scatterdiv8si (__addr, __mask, (__v4di) __index,
+                               (__v4si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_epi32 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v4si) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index,
+                               (__v4si) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_i64scatter_epi64 (void *__addr, __m256i __index,
+                        __m256i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF,
+                               (__v4di) __index, (__v4di) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+                             __m256i __index, __m256i __v1,
+                             const int __scale)
+{
+  __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index,
+                               (__v4di) __v1, __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_i64scatter_epi64 (void *__addr, __m128i __index,
+                     __m128i __v1, const int __scale)
+{
+  __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF,
+                               (__v2di) __index, (__v2di) __v1,
+                               __scale);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask,
+                          __m128i __index, __m128i __v1,
+                          const int __scale)
+{
+  __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index,
+                               (__v2di) __v1, __scale);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_shuffle_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                          _MM_PERM_ENUM __mask)
+{
+  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_shuffle_epi32 (__mmask8 __U, __m256i __A,
+                           _MM_PERM_ENUM __mask)
+{
+  return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_shuffle_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                       _MM_PERM_ENUM __mask)
+{
+  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_shuffle_epi32 (__mmask8 __U, __m128i __A,
+                        _MM_PERM_ENUM __mask)
+{
+  return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rol_epi32 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rol_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
+{
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rol_epi32 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rol_epi32 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rol_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
+{
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rol_epi32 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ror_epi32 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ror_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
+{
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+                                                (__v8si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ror_epi32 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B,
+                                                (__v8si)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ror_epi32 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ror_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
+{
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+                                                (__v4si) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ror_epi32 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B,
+                                                (__v4si)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rol_epi64 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_rol_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
+{
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_rol_epi64 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rol_epi64 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_rol_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
+{
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_rol_epi64 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_ror_epi64 (__m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_ror_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                      const int __B)
+{
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+                                                (__v4di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_ror_epi64 (__mmask8 __U, __m256i __A, const int __B)
+{
+  return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B,
+                                                (__v4di)
+                                                _mm256_setzero_si256 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ror_epi64 (__m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_ror_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                   const int __B)
+{
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+                                                (__v2di) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_ror_epi64 (__mmask8 __U, __m128i __A, const int __B)
+{
+  return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B,
+                                                (__v2di)
+                                                _mm_setzero_si128 (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi32 (__m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+                                                 (__v4si) __B, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                      __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+                                                 (__v4si) __B, __imm,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi32 (__mmask8 __U, __m128i __A, __m128i __B,
+                       const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A,
+                                                 (__v4si) __B, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi64 (__m128i __A, __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+                                                 (__v2di) __B, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_alignr_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                      __m128i __B, const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+                                                 (__v2di) __B, __imm,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_alignr_epi64 (__mmask8 __U, __m128i __A, __m128i __B,
+                       const int __imm)
+{
+  return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A,
+                                                 (__v2di) __B, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi32 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+                                                 (__v8si) __B, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                         __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+                                                 (__v8si) __B, __imm,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi32 (__mmask8 __U, __m256i __A, __m256i __B,
+                          const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A,
+                                                 (__v8si) __B, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_alignr_epi64 (__m256i __A, __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+                                                 (__v4di) __B, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_alignr_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                         __m256i __B, const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+                                                 (__v4di) __B, __imm,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_alignr_epi64 (__mmask8 __U, __m256i __A, __m256i __B,
+                          const int __imm)
+{
+  return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A,
+                                                 (__v4di) __B, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A,
+                  const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
+                                                 (__v8hi) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I,
+                                                 (__v8hi)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A,
+                     const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
+                                                    (__v8hi) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I,
+                                                    (__v8hi)
+                                                    _mm_setzero_si128 (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
+{
+  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
+{
+  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_srai_epi64 (__m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       const int __imm)
+{
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm)
+{
+  return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi64 (__m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) -1);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A,
+                    const int __imm)
+{
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm)
+{
+  return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
+                                                 (__v4si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B,
+                                                 (__v4si)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
+                                                 (__v2di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B)
+{
+  return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B,
+                                                 (__v2di)
+                                                 _mm_setzero_si128 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A,
+                       int __B)
+{
+  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
+                                                 (__v8si) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B,
+                                                 (__v8si)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A,
+                       int __B)
+{
+  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
+                                                 (__v4di) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B)
+{
+  return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B,
+                                                 (__v4di)
+                                                 _mm256_setzero_si256 (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permutex_pd (__m256d __W, __mmask8 __U, __m256d __X,
+                        const int __imm)
+{
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
+                                                 (__v4df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permutex_pd (__mmask8 __U, __m256d __X, const int __imm)
+{
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm,
+                                                 (__v4df)
+                                                 _mm256_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_pd (__m256d __W, __mmask8 __U, __m256d __X,
+                       const int __C)
+{
+  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_pd (__mmask8 __U, __m256d __X, const int __C)
+{
+  return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C,
+                                                    (__v4df)
+                                                    _mm256_setzero_pd (),
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_pd (__m128d __W, __mmask8 __U, __m128d __X,
+                    const int __C)
+{
+  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
+                                                 (__v2df) __W,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_pd (__mmask8 __U, __m128d __X, const int __C)
+{
+  return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C,
+                                                 (__v2df)
+                                                 _mm_setzero_pd (),
+                                                 (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_permute_ps (__m256 __W, __mmask8 __U, __m256 __X,
+                       const int __C)
+{
+  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_permute_ps (__mmask8 __U, __m256 __X, const int __C)
+{
+  return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C,
+                                                   (__v8sf)
+                                                   _mm256_setzero_ps (),
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_permute_ps (__m128 __W, __mmask8 __U, __m128 __X,
+                    const int __C)
+{
+  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
+                                                (__v4sf) __W,
+                                                (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C)
+{
+  return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C,
+                                                (__v4sf)
+                                                _mm_setzero_ps (),
+                                                (__mmask8) __U);
+}
+
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W)
+{
+  return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A,
+                                                    (__v4df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W)
+{
+  return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A,
+                                                   (__v8sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A,
+                                                   (__v4di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W)
+{
+  return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A,
+                                                   (__v8si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W)
+{
+  return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A,
+                                                    (__v2df) __W,
+                                                    (__mmask8) __U);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W)
+{
+  return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A,
+                                                   (__v4sf) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A,
+                                                   (__v2di) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W)
+{
+  return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A,
+                                                   (__v4si) __W,
+                                                   (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, __P,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epi32_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, __P,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu64_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_epu32_mask (__m256i __X, __m256i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pd_mask (__m256d __X, __m256d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
+                                                 (__v4df) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ps_mask (__m256 __X, __m256 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
+                                                 (__v8sf) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X,
+                                                (__v4di) __Y, __P,
+                                                (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epi32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X,
+                                                (__v8si) __Y, __P,
+                                                (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu64_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X,
+                                                 (__v4di) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_epu32_mask (__mmask8 __U, __m256i __X, __m256i __Y,
+                           const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X,
+                                                 (__v8si) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_pd_mask (__mmask8 __U, __m256d __X, __m256d __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X,
+                                                 (__v4df) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_cmp_ps_mask (__mmask8 __U, __m256 __X, __m256 __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X,
+                                                 (__v8sf) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi64_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, __P,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epi32_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, __P,
+                                                (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu64_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_epu32_mask (__m128i __X, __m128i __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pd_mask (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
+                                                 (__v2df) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ps_mask (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
+                                                 (__v4sf) __Y, __P,
+                                                 (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X,
+                                                (__v2di) __Y, __P,
+                                                (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epi32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X,
+                                                (__v4si) __Y, __P,
+                                                (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu64_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X,
+                                                 (__v2di) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_epu32_mask (__mmask8 __U, __m128i __X, __m128i __Y,
+                        const int __P)
+{
+  return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X,
+                                                 (__v4si) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_pd_mask (__mmask8 __U, __m128d __X, __m128d __Y,
+                     const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X,
+                                                 (__v2df) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_cmp_ps_mask (__mmask8 __U, __m128 __X, __m128 __Y,
+                     const int __P)
+{
+  return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X,
+                                                 (__v4sf) __Y, __P,
+                                                 (__mmask8) __U);
+}
+
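The compare intrinsics above return an __mmask8 with one bit per lane; the integer forms take an _MM_CMPINT_* predicate and the ps/pd forms a _CMP_* predicate. A small illustrative sketch (not from the header), assuming -mavx512f -mavx512vl:

  #include <immintrin.h>

  /* Keep the lanes of x that are strictly less than the matching lane
     of y, zeroing the rest.  */
  static __m256i
  keep_less_than (__m256i x, __m256i y)
  {
    __mmask8 m = _mm256_cmp_epi32_mask (x, y, _MM_CMPINT_LT);
    return _mm256_maskz_mov_epi32 (m, x);
  }
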
+extern __inline __m256d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutex_pd (__m256d __X, const int __M)
+{
+  return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __M,
+                                                 (__v4df)
+                                                 _mm256_undefined_pd (),
+                                                 (__mmask8) -1);
+}
+
+#else
+#define _mm256_permutex_pd(X, M)                                               \
+  ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(X), (int)(M),    \
+                                           (__v4df)(__m256d)                   \
+                                           _mm256_undefined_pd (),             \
+                                           (__mmask8)-1))
+
+#define _mm256_permutex_epi64(X, I)               \
+  ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
+                                           (int)(I),           \
+                                           (__v4di)(__m256i)   \
+                                           (_mm256_setzero_si256 ()),\
+                                           (__mmask8) -1))
+
+#define _mm256_maskz_permutex_epi64(M, X, I)                    \
+  ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X),    \
+                                           (int)(I),                \
+                                           (__v4di)(__m256i)        \
+                                           (_mm256_setzero_si256 ()),\
+                                           (__mmask8)(M)))
+
+#define _mm256_mask_permutex_epi64(W, M, X, I)               \
+  ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \
+                                           (int)(I),             \
+                                           (__v4di)(__m256i)(W), \
+                                           (__mmask8)(M)))
+
+#define _mm256_insertf32x4(X, Y, C)                                     \
+  ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X),  \
+    (__v4sf)(__m128) (Y), (int) (C),                                   \
+    (__v8sf)(__m256)_mm256_setzero_ps (),                              \
+    (__mmask8)-1))
+
+#define _mm256_mask_insertf32x4(W, U, X, Y, C)                          \
+  ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X),  \
+    (__v4sf)(__m128) (Y), (int) (C),                                   \
+    (__v8sf)(__m256)(W),                                               \
+    (__mmask8)(U)))
+
+#define _mm256_maskz_insertf32x4(U, X, Y, C)                            \
+  ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \
+    (__v4sf)(__m128) (Y), (int) (C),                                   \
+    (__v8sf)(__m256)_mm256_setzero_ps (),                              \
+    (__mmask8)(U)))
+
+#define _mm256_inserti32x4(X, Y, C)                                     \
+  ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\
+    (__v4si)(__m128i) (Y), (int) (C),                                  \
+    (__v8si)(__m256i)_mm256_setzero_si256 (),                          \
+    (__mmask8)-1))
+
+#define _mm256_mask_inserti32x4(W, U, X, Y, C)                          \
+  ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\
+    (__v4si)(__m128i) (Y), (int) (C),                                  \
+    (__v8si)(__m256i)(W),                                              \
+    (__mmask8)(U)))
+
+#define _mm256_maskz_inserti32x4(U, X, Y, C)                            \
+  ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\
+    (__v4si)(__m128i) (Y), (int) (C),                                  \
+    (__v8si)(__m256i)_mm256_setzero_si256 (),                          \
+    (__mmask8)(U)))
+
+#define _mm256_extractf32x4_ps(X, C)                                    \
+  ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
+    (int) (C),                                                         \
+    (__v4sf)(__m128)_mm_setzero_ps (),                                 \
+    (__mmask8)-1))
+
+#define _mm256_mask_extractf32x4_ps(W, U, X, C)                         \
+  ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
+    (int) (C),                                                         \
+    (__v4sf)(__m128)(W),                                               \
+    (__mmask8)(U)))
+
+#define _mm256_maskz_extractf32x4_ps(U, X, C)                           \
+  ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \
+    (int) (C),                                                         \
+    (__v4sf)(__m128)_mm_setzero_ps (),                                 \
+    (__mmask8)(U)))
+
+#define _mm256_extracti32x4_epi32(X, C)                                 \
+  ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\
+    (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm256_mask_extracti32x4_epi32(W, U, X, C)                      \
+  ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\
+    (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_extracti32x4_epi32(U, X, C)                        \
+  ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\
+    (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U)))
+
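The insertf32x4/extracti32x4 macros above take an immediate 128-bit lane selector (0 or 1 at 256-bit width). An illustrative sketch (not from the header), assuming -mavx512f -mavx512vl:

  #include <immintrin.h>

  /* Swap the two 128-bit halves of an 8-float vector via extract + insert.  */
  static __m256
  swap_halves (__m256 v)
  {
    __m128 hi = _mm256_extractf32x4_ps (v, 1);   /* upper 128 bits */
    __m128 lo = _mm256_extractf32x4_ps (v, 0);   /* lower 128 bits */
    __m256 r  = _mm256_insertf32x4 (v, hi, 0);   /* hi into the low half */
    return _mm256_insertf32x4 (r, lo, 1);        /* lo into the high half */
  }
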
+#define _mm256_shuffle_i64x2(X, Y, C)                                                   \
+  ((__m256i)  __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X),                 \
+                                                  (__v4di)(__m256i)(Y), (int)(C),       \
+                                                  (__v4di)(__m256i)_mm256_setzero_si256 (), \
+                                                  (__mmask8)-1))
+
+#define _mm256_mask_shuffle_i64x2(W, U, X, Y, C)                                        \
+  ((__m256i)  __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X),                 \
+                                                  (__v4di)(__m256i)(Y), (int)(C),       \
+                                                  (__v4di)(__m256i)(W),\
+                                                  (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_i64x2(U, X, Y, C)                                          \
+  ((__m256i)  __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X),                 \
+                                                  (__v4di)(__m256i)(Y), (int)(C),       \
+                                                  (__v4di)(__m256i)_mm256_setzero_si256 (), \
+                                                  (__mmask8)(U)))
+
+#define _mm256_shuffle_i32x4(X, Y, C)                                                   \
+  ((__m256i)  __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X),                 \
+                                                  (__v8si)(__m256i)(Y), (int)(C),       \
+                                                 (__v8si)(__m256i)                     \
+                                                 _mm256_setzero_si256 (),              \
+                                                  (__mmask8)-1))
+
+#define _mm256_mask_shuffle_i32x4(W, U, X, Y, C)                                        \
+  ((__m256i)  __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X),                 \
+                                                  (__v8si)(__m256i)(Y), (int)(C),       \
+                                                  (__v8si)(__m256i)(W),                 \
+                                                  (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_i32x4(U, X, Y, C)                                          \
+  ((__m256i)  __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X),                 \
+                                                  (__v8si)(__m256i)(Y), (int)(C),       \
+                                                 (__v8si)(__m256i)                     \
+                                                 _mm256_setzero_si256 (),              \
+                                                  (__mmask8)(U)))
+
+#define _mm256_shuffle_f64x2(X, Y, C)                                                   \
+  ((__m256d)  __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X),                 \
+                                                  (__v4df)(__m256d)(Y), (int)(C),       \
+                                                 (__v4df)(__m256d)_mm256_setzero_pd (),\
+                                                  (__mmask8)-1))
+
+#define _mm256_mask_shuffle_f64x2(W, U, X, Y, C)                                        \
+  ((__m256d)  __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X),                 \
+                                                  (__v4df)(__m256d)(Y), (int)(C),       \
+                                                  (__v4df)(__m256d)(W),                 \
+                                                  (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_f64x2(U, X, Y, C)                                          \
+  ((__m256d)  __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X),                 \
+                                                  (__v4df)(__m256d)(Y), (int)(C),       \
+                                                 (__v4df)(__m256d)_mm256_setzero_pd (),\
+                                                  (__mmask8)(U)))
+
+#define _mm256_shuffle_f32x4(X, Y, C)                                                   \
+  ((__m256)  __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X),                   \
+                                                 (__v8sf)(__m256)(Y), (int)(C),         \
+                                                (__v8sf)(__m256)_mm256_setzero_ps (),  \
+                                                 (__mmask8)-1))
+
+#define _mm256_mask_shuffle_f32x4(W, U, X, Y, C)                                        \
+  ((__m256)  __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X),                   \
+                                                 (__v8sf)(__m256)(Y), (int)(C),         \
+                                                 (__v8sf)(__m256)(W),                   \
+                                                 (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_f32x4(U, X, Y, C)                                          \
+  ((__m256)  __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X),                   \
+                                                 (__v8sf)(__m256)(Y), (int)(C),         \
+                                                (__v8sf)(__m256)_mm256_setzero_ps (),  \
+                                                 (__mmask8)(U)))
+
+#define _mm256_mask_shuffle_pd(W, U, A, B, C)                                   \
+  ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A),                \
+                                           (__v4df)(__m256d)(B), (int)(C),      \
+                                           (__v4df)(__m256d)(W),                \
+                                           (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_pd(U, A, B, C)                                     \
+  ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A),                \
+                                           (__v4df)(__m256d)(B), (int)(C),      \
+                                          (__v4df)(__m256d)                    \
+                                          _mm256_setzero_pd (),                \
+                                           (__mmask8)(U)))
+
+#define _mm_mask_shuffle_pd(W, U, A, B, C)                                      \
+  ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A),                \
+                                           (__v2df)(__m128d)(B), (int)(C),      \
+                                           (__v2df)(__m128d)(W),                \
+                                           (__mmask8)(U)))
+
+#define _mm_maskz_shuffle_pd(U, A, B, C)                                        \
+  ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A),                \
+                                           (__v2df)(__m128d)(B), (int)(C),      \
+                                          (__v2df)(__m128d)_mm_setzero_pd (),  \
+                                           (__mmask8)(U)))
+
+#define _mm256_mask_shuffle_ps(W, U, A, B, C)                                   \
+  ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A),                 \
+                                           (__v8sf)(__m256)(B), (int)(C),       \
+                                           (__v8sf)(__m256)(W),                 \
+                                           (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_ps(U, A, B, C)                                     \
+  ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A),                 \
+                                           (__v8sf)(__m256)(B), (int)(C),       \
+                                          (__v8sf)(__m256)_mm256_setzero_ps (),\
+                                           (__mmask8)(U)))
+
+#define _mm_mask_shuffle_ps(W, U, A, B, C)                                      \
+  ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A),                 \
+                                           (__v4sf)(__m128)(B), (int)(C),       \
+                                           (__v4sf)(__m128)(W),                 \
+                                           (__mmask8)(U)))
+
+#define _mm_maskz_shuffle_ps(U, A, B, C)                                        \
+  ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A),                 \
+                                           (__v4sf)(__m128)(B), (int)(C),       \
+                                          (__v4sf)(__m128)_mm_setzero_ps (),   \
+                                           (__mmask8)(U)))
+
+#define _mm256_fixupimm_pd(X, Y, Z, C)                                          \
+  ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X),           \
+                                              (__v4df)(__m256d)(Y),            \
+                                              (__v4di)(__m256i)(Z), (int)(C),  \
+                                              (__mmask8)(-1)))
+
+#define _mm256_mask_fixupimm_pd(X, U, Y, Z, C)                                  \
+   ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X),           \
+                                               (__v4df)(__m256d)(Y),           \
+                                               (__v4di)(__m256i)(Z), (int)(C), \
+                                               (__mmask8)(U)))
+
+#define _mm256_maskz_fixupimm_pd(U, X, Y, Z, C)                                 \
+   ((__m256d)__builtin_ia32_fixupimmpd256_maskz ((__v4df)(__m256d)(X),          \
+                                                (__v4df)(__m256d)(Y),          \
+                                                (__v4di)(__m256i)(Z), (int)(C),\
+                                                (__mmask8)(U)))
+
+#define _mm256_fixupimm_ps(X, Y, Z, C)                                         \
+  ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X),             \
+                                             (__v8sf)(__m256)(Y),              \
+                                             (__v8si)(__m256i)(Z), (int)(C),   \
+                                             (__mmask8)(-1)))
+
+
+#define _mm256_mask_fixupimm_ps(X, U, Y, Z, C)                                  \
+    ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X),            \
+                                               (__v8sf)(__m256)(Y),            \
+                                               (__v8si)(__m256i)(Z), (int)(C), \
+                                               (__mmask8)(U)))
+
+#define _mm256_maskz_fixupimm_ps(U, X, Y, Z, C)                                 \
+    ((__m256)__builtin_ia32_fixupimmps256_maskz ((__v8sf)(__m256)(X),           \
+                                                (__v8sf)(__m256)(Y),           \
+                                                (__v8si)(__m256i)(Z), (int)(C),\
+                                                (__mmask8)(U)))
+
+#define _mm_fixupimm_pd(X, Y, Z, C)                                            \
+  ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X),           \
+                                              (__v2df)(__m128d)(Y),            \
+                                              (__v2di)(__m128i)(Z), (int)(C),  \
+                                              (__mmask8)(-1)))
+
+
+#define _mm_mask_fixupimm_pd(X, U, Y, Z, C)                                       \
+     ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X),           \
+                                                 (__v2df)(__m128d)(Y),           \
+                                                 (__v2di)(__m128i)(Z), (int)(C), \
+                                                 (__mmask8)(U)))
+
+#define _mm_maskz_fixupimm_pd(U, X, Y, Z, C)                                      \
+     ((__m128d)__builtin_ia32_fixupimmpd128_maskz ((__v2df)(__m128d)(X),          \
+                                                  (__v2df)(__m128d)(Y),          \
+                                                  (__v2di)(__m128i)(Z), (int)(C),\
+                                                  (__mmask8)(U)))
+
+#define _mm_fixupimm_ps(X, Y, Z, C)                                            \
+   ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X),            \
+                                              (__v4sf)(__m128)(Y),             \
+                                              (__v4si)(__m128i)(Z), (int)(C),  \
+                                              (__mmask8)(-1)))
+
+#define _mm_mask_fixupimm_ps(X, U, Y, Z, C)                                      \
+      ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X),           \
+                                                 (__v4sf)(__m128)(Y),           \
+                                                 (__v4si)(__m128i)(Z), (int)(C),\
+                                                 (__mmask8)(U)))
+
+#define _mm_maskz_fixupimm_ps(U, X, Y, Z, C)                                      \
+      ((__m128)__builtin_ia32_fixupimmps128_maskz ((__v4sf)(__m128)(X),           \
+                                                  (__v4sf)(__m128)(Y),           \
+                                                  (__v4si)(__m128i)(Z), (int)(C),\
+                                                  (__mmask8)(U)))
+
+#define _mm256_mask_srli_epi32(W, U, A, B)                             \
+  ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A),      \
+    (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srli_epi32(U, A, B)                               \
+  ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A),      \
+    (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_mask_srli_epi32(W, U, A, B)                                 \
+  ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A),       \
+    (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srli_epi32(U, A, B)                                   \
+  ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A),       \
+    (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
+
+#define _mm256_mask_srli_epi64(W, U, A, B)                             \
+  ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A),      \
+    (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srli_epi64(U, A, B)                               \
+  ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A),      \
+    (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_mask_srli_epi64(W, U, A, B)                                 \
+  ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A),       \
+    (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srli_epi64(U, A, B)                                   \
+  ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A),       \
+    (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
+
+#define _mm256_mask_slli_epi32(W, U, X, C)                                \
+  ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\
+    (__v8si)(__m256i)(W),                                                \
+    (__mmask8)(U)))
+
+#define _mm256_maskz_slli_epi32(U, X, C)                                  \
+  ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\
+    (__v8si)(__m256i)_mm256_setzero_si256 (),                            \
+    (__mmask8)(U)))
+
+#define _mm256_mask_slli_epi64(W, U, X, C)                                \
+  ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\
+    (__v4di)(__m256i)(W),                                                \
+    (__mmask8)(U)))
+
+#define _mm256_maskz_slli_epi64(U, X, C)                                  \
+  ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\
+    (__v4di)(__m256i)_mm256_setzero_si256 (),                            \
+    (__mmask8)(U)))
+
+#define _mm_mask_slli_epi32(W, U, X, C)                                          \
+  ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\
+    (__v4si)(__m128i)(W),\
+    (__mmask8)(U)))
+
+#define _mm_maskz_slli_epi32(U, X, C)                                    \
+  ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\
+    (__v4si)(__m128i)_mm_setzero_si128 (),\
+    (__mmask8)(U)))
+
+#define _mm_mask_slli_epi64(W, U, X, C)                                          \
+  ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\
+    (__v2di)(__m128i)(W),\
+    (__mmask8)(U)))
+
+#define _mm_maskz_slli_epi64(U, X, C)                                    \
+  ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\
+    (__v2di)(__m128i)_mm_setzero_si128 (),\
+    (__mmask8)(U)))
+
+#define _mm256_ternarylogic_epi64(A, B, C, I)                  \
+  ((__m256i)                                                   \
+   __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A),   \
+                                    (__v4di) (__m256i) (B),    \
+                                    (__v4di) (__m256i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) -1))
+
+#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I)          \
+  ((__m256i)                                                   \
+   __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A),   \
+                                    (__v4di) (__m256i) (B),    \
+                                    (__v4di) (__m256i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) (U)))
+
+#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I)         \
+  ((__m256i)                                                   \
+   __builtin_ia32_pternlogq256_maskz ((__v4di) (__m256i) (A),  \
+                                     (__v4di) (__m256i) (B),   \
+                                     (__v4di) (__m256i) (C),   \
+                                     (unsigned char) (I),      \
+                                     (__mmask8) (U)))
+
+#define _mm256_ternarylogic_epi32(A, B, C, I)                  \
+  ((__m256i)                                                   \
+   __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A),   \
+                                    (__v8si) (__m256i) (B),    \
+                                    (__v8si) (__m256i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) -1))
+
+#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I)          \
+  ((__m256i)                                                   \
+   __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A),   \
+                                    (__v8si) (__m256i) (B),    \
+                                    (__v8si) (__m256i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) (U)))
+
+#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I)         \
+  ((__m256i)                                                   \
+   __builtin_ia32_pternlogd256_maskz ((__v8si) (__m256i) (A),  \
+                                     (__v8si) (__m256i) (B),   \
+                                     (__v8si) (__m256i) (C),   \
+                                     (unsigned char) (I),      \
+                                     (__mmask8) (U)))
+
+#define _mm_ternarylogic_epi64(A, B, C, I)                     \
+  ((__m128i)                                                   \
+   __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A),   \
+                                    (__v2di) (__m128i) (B),    \
+                                    (__v2di) (__m128i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) -1))
+
+#define _mm_mask_ternarylogic_epi64(A, U, B, C, I)             \
+  ((__m128i)                                                   \
+   __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A),   \
+                                    (__v2di) (__m128i) (B),    \
+                                    (__v2di) (__m128i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) (U)))
+
+#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I)            \
+  ((__m128i)                                                   \
+   __builtin_ia32_pternlogq128_maskz ((__v2di) (__m128i) (A),  \
+                                     (__v2di) (__m128i) (B),   \
+                                     (__v2di) (__m128i) (C),   \
+                                     (unsigned char) (I),      \
+                                     (__mmask8) (U)))
+
+#define _mm_ternarylogic_epi32(A, B, C, I)                     \
+  ((__m128i)                                                   \
+   __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A),   \
+                                    (__v4si) (__m128i) (B),    \
+                                    (__v4si) (__m128i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) -1))
+
+#define _mm_mask_ternarylogic_epi32(A, U, B, C, I)             \
+  ((__m128i)                                                   \
+   __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A),   \
+                                    (__v4si) (__m128i) (B),    \
+                                    (__v4si) (__m128i) (C),    \
+                                    (unsigned char) (I),       \
+                                    (__mmask8) (U)))
+
+#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I)            \
+  ((__m128i)                                                   \
+   __builtin_ia32_pternlogd128_maskz ((__v4si) (__m128i) (A),  \
+                                     (__v4si) (__m128i) (B),   \
+                                     (__v4si) (__m128i) (C),   \
+                                     (unsigned char) (I),      \
+                                     (__mmask8) (U)))
+
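The ternarylogic immediate is an 8-entry truth table indexed by the bit triple taken from the three operands, so any bitwise function of three inputs costs a single instruction. Illustrative sketch (not from the header), assuming -mavx512f -mavx512vl:

  #include <immintrin.h>

  /* Three-way XOR: imm 0x96 has a 1 exactly where an odd number of the
     inputs (a, b, c) are 1.  */
  static __m256i
  xor3 (__m256i a, __m256i b, __m256i c)
  {
    return _mm256_ternarylogic_epi32 (a, b, c, 0x96);
  }

  /* Bitwise majority vote: imm 0xE8 selects the patterns with two or
     three inputs set.  */
  static __m256i
  majority (__m256i a, __m256i b, __m256i c)
  {
    return _mm256_ternarylogic_epi32 (a, b, c, 0xE8);
  }
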
+#define _mm256_roundscale_ps(A, B)                                     \
+  ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A),    \
+    (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1))
+
+#define _mm256_mask_roundscale_ps(W, U, A, B)                          \
+  ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A),    \
+    (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_roundscale_ps(U, A, B)                            \
+  ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A),    \
+    (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U)))
+
+#define _mm256_roundscale_pd(A, B)                                     \
+  ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A),  \
+    (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1))
+
+#define _mm256_mask_roundscale_pd(W, U, A, B)                          \
+  ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A),  \
+    (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_roundscale_pd(U, A, B)                            \
+  ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A),  \
+    (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U)))
+
+#define _mm_roundscale_ps(A, B)                                                \
+  ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A),    \
+    (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1))
+
+#define _mm_mask_roundscale_ps(W, U, A, B)                             \
+  ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A),    \
+    (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_roundscale_ps(U, A, B)                               \
+  ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A),    \
+    (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U)))
+
+#define _mm_roundscale_pd(A, B)                                                \
+  ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A),  \
+    (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1))
+
+#define _mm_mask_roundscale_pd(W, U, A, B)                             \
+  ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A),  \
+    (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_roundscale_pd(U, A, B)                               \
+  ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A),  \
+    (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U)))
+
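roundscale rounds each element to a fixed number of fraction bits: imm[7:4] gives the number of fraction bits to keep and the low bits the rounding mode, so an immediate of 0x01 (zero fraction bits, round toward negative infinity) behaves as an element-wise floor. Illustrative sketch (not from the header), assuming -mavx512f -mavx512vl:

  #include <immintrin.h>

  /* Element-wise floor: keep 0 fraction bits, round toward -inf.  */
  static __m256
  floor_ps (__m256 x)
  {
    return _mm256_roundscale_ps (x, 0x01);
  }

  /* Round to multiples of 0.25: keep 2 fraction bits, round to nearest.  */
  static __m256
  quarter_ps (__m256 x)
  {
    return _mm256_roundscale_ps (x, (2 << 4) | 0x00);
  }
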
+#define _mm256_getmant_ps(X, B, C)                                              \
+  ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X),             \
+                                         (int)(((C)<<2) | (B)),                 \
+                                        (__v8sf)(__m256)_mm256_setzero_ps (),  \
+                                         (__mmask8)-1))
+
+#define _mm256_mask_getmant_ps(W, U, X, B, C)                                   \
+  ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X),             \
+                                         (int)(((C)<<2) | (B)),                 \
+                                         (__v8sf)(__m256)(W),                   \
+                                         (__mmask8)(U)))
+
+#define _mm256_maskz_getmant_ps(U, X, B, C)                                     \
+  ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X),             \
+                                         (int)(((C)<<2) | (B)),                 \
+                                        (__v8sf)(__m256)_mm256_setzero_ps (),  \
+                                         (__mmask8)(U)))
+
+#define _mm_getmant_ps(X, B, C)                                                 \
+  ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X),             \
+                                         (int)(((C)<<2) | (B)),                 \
+                                        (__v4sf)(__m128)_mm_setzero_ps (),     \
+                                         (__mmask8)-1))
+
+#define _mm_mask_getmant_ps(W, U, X, B, C)                                      \
+  ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X),             \
+                                         (int)(((C)<<2) | (B)),                 \
+                                         (__v4sf)(__m128)(W),                   \
+                                         (__mmask8)(U)))
+
+#define _mm_maskz_getmant_ps(U, X, B, C)                                        \
+  ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X),             \
+                                         (int)(((C)<<2) | (B)),                 \
+                                        (__v4sf)(__m128)_mm_setzero_ps (),     \
+                                         (__mmask8)(U)))
+
+#define _mm256_getmant_pd(X, B, C)                                              \
+  ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X),           \
+                                         (int)(((C)<<2) | (B)),                 \
+                                         (__v4df)(__m256d)_mm256_setzero_pd (),\
+                                          (__mmask8)-1))
+
+#define _mm256_mask_getmant_pd(W, U, X, B, C)                                   \
+  ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X),           \
+                                         (int)(((C)<<2) | (B)),                 \
+                                          (__v4df)(__m256d)(W),                 \
+                                          (__mmask8)(U)))
+
+#define _mm256_maskz_getmant_pd(U, X, B, C)                                     \
+  ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X),           \
+                                         (int)(((C)<<2) | (B)),                 \
+                                         (__v4df)(__m256d)_mm256_setzero_pd (),\
+                                          (__mmask8)(U)))
+
+#define _mm_getmant_pd(X, B, C)                                                 \
+  ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X),           \
+                                         (int)(((C)<<2) | (B)),                 \
+                                         (__v2df)(__m128d)_mm_setzero_pd (),   \
+                                          (__mmask8)-1))
+
+#define _mm_mask_getmant_pd(W, U, X, B, C)                                      \
+  ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X),           \
+                                         (int)(((C)<<2) | (B)),                 \
+                                          (__v2df)(__m128d)(W),                 \
+                                          (__mmask8)(U)))
+
+#define _mm_maskz_getmant_pd(U, X, B, C)                                        \
+  ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X),           \
+                                         (int)(((C)<<2) | (B)),                 \
+                                         (__v2df)(__m128d)_mm_setzero_pd (),   \
+                                          (__mmask8)(U)))
+
+#define _mm256_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m256) __builtin_ia32_gather3siv8sf ((__v8sf)(__m256) (V1OLD),     \
+                                        (void const *) (ADDR),         \
+                                        (__v8si)(__m256i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)                \
+  (__m128) __builtin_ia32_gather3siv4sf ((__v4sf)(__m128) (V1OLD),     \
+                                        (void const *) (ADDR),         \
+                                        (__v4si)(__m128i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm256_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m256d) __builtin_ia32_gather3siv4df ((__v4df)(__m256d) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4si)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)                \
+  (__m128d) __builtin_ia32_gather3siv2df ((__v2df)(__m128d) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4si)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm256_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m128) __builtin_ia32_gather3div8sf ((__v4sf)(__m128) (V1OLD),     \
+                                        (void const *) (ADDR),         \
+                                        (__v4di)(__m256i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE)                \
+  (__m128) __builtin_ia32_gather3div4sf ((__v4sf)(__m128) (V1OLD),     \
+                                        (void const *) (ADDR),         \
+                                        (__v2di)(__m128i) (INDEX),     \
+                                        (__mmask8) (MASK),             \
+                                        (int) (SCALE))
+
+#define _mm256_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m256d) __builtin_ia32_gather3div4df ((__v4df)(__m256d) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4di)(__m256i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE)                \
+  (__m128d) __builtin_ia32_gather3div2df ((__v2df)(__m128d) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v2di)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm256_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)  \
+  (__m256i) __builtin_ia32_gather3siv8si ((__v8si)(__m256i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v8si)(__m256i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m128i) __builtin_ia32_gather3siv4si ((__v4si)(__m128i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4si)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm256_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)  \
+  (__m256i) __builtin_ia32_gather3siv4di ((__v4di)(__m256i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4si)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m128i) __builtin_ia32_gather3siv2di ((__v2di)(__m128i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4si)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm256_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)  \
+  (__m128i) __builtin_ia32_gather3div8si ((__v4si)(__m128i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4di)(__m256i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m128i) __builtin_ia32_gather3div4si ((__v4si)(__m128i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v2di)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm256_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)  \
+  (__m256i) __builtin_ia32_gather3div4di ((__v4di)(__m256i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v4di)(__m256i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
+#define _mm_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE)     \
+  (__m128i) __builtin_ia32_gather3div2di ((__v2di)(__m128i) (V1OLD),   \
+                                         (void const *) (ADDR),        \
+                                         (__v2di)(__m128i) (INDEX),    \
+                                         (__mmask8) (MASK),            \
+                                         (int) (SCALE))
+
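The masked gather forms load table[index[i]] for lanes whose mask bit is set and keep the pass-through value elsewhere; the scale immediate is the element stride in bytes (1, 2, 4 or 8) and must be a compile-time constant. Illustrative sketch (not from the header), assuming -mavx512f -mavx512vl:

  #include <immintrin.h>

  /* Gather eight floats from 'table' at the 32-bit indices in 'idx'.
     Lanes with a clear mask bit keep the value from 'fallback'.  */
  static __m256
  gather_masked (const float *table, __m256i idx, __mmask8 m, __m256 fallback)
  {
    return _mm256_mmask_i32gather_ps (fallback, m, idx, table, 4);
  }
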
+#define _mm256_i32scatter_ps(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8sf)(__m256) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_ps(ADDR, INDEX, V1, SCALE)                      \
+  __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE)           \
+  __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm256_i32scatter_pd(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_pd(ADDR, INDEX, V1, SCALE)                      \
+  __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE)           \
+  __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_ps(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_ps(ADDR, INDEX, V1, SCALE)                      \
+  __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE)           \
+  __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v4sf)(__m128) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_pd(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4df)(__m256d) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_pd(ADDR, INDEX, V1, SCALE)                      \
+  __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE)           \
+  __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v2df)(__m128d) (V1), (int) (SCALE))
+
+#define _mm256_i32scatter_epi32(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v8si)(__m256i) (INDEX),              \
+                               (__v8si)(__m256i) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_epi32(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_i32scatter_epi64(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm_i32scatter_epi64(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v2di)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4si)(__m128i) (INDEX),              \
+                               (__v2di)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_epi32(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_epi32(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v4si)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_i64scatter_epi64(ADDR, INDEX, V1, SCALE)                        \
+  __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm256_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)     \
+  __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v4di)(__m256i) (INDEX),              \
+                               (__v4di)(__m256i) (V1), (int) (SCALE))
+
+#define _mm_i64scatter_epi64(ADDR, INDEX, V1, SCALE)                   \
+  __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8)0xFF,       \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v2di)(__m128i) (V1), (int) (SCALE))
+
+#define _mm_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE)                \
+  __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8) (MASK),    \
+                               (__v2di)(__m128i) (INDEX),              \
+                               (__v2di)(__m128i) (V1), (int) (SCALE))
+
+#define _mm256_mask_shuffle_epi32(W, U, X, C)                                       \
+  ((__m256i)  __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C),        \
+                                             (__v8si)(__m256i)(W),                  \
+                                             (__mmask8)(U)))
+
+#define _mm256_maskz_shuffle_epi32(U, X, C)                                         \
+  ((__m256i)  __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C),        \
+                                            (__v8si)(__m256i)                      \
+                                            _mm256_setzero_si256 (),               \
+                                             (__mmask8)(U)))
+
+#define _mm_mask_shuffle_epi32(W, U, X, C)                                          \
+  ((__m128i)  __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C),        \
+                                             (__v4si)(__m128i)(W),                  \
+                                             (__mmask8)(U)))
+
+#define _mm_maskz_shuffle_epi32(U, X, C)                                            \
+  ((__m128i)  __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C),        \
+                                            (__v4si)(__m128i)_mm_setzero_si128 (), \
+                                             (__mmask8)(U)))
+
+#define _mm256_rol_epi64(A, B)                                                 \
+  ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B),      \
+                                          (__v4di)(__m256i)_mm256_setzero_si256 (),\
+                                          (__mmask8)-1))
+
+#define _mm256_mask_rol_epi64(W, U, A, B)                                      \
+  ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B),      \
+                                          (__v4di)(__m256i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm256_maskz_rol_epi64(U, A, B)                                        \
+  ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B),      \
+                                          (__v4di)(__m256i)_mm256_setzero_si256 (),\
+                                          (__mmask8)(U)))
+
+#define _mm_rol_epi64(A, B)                                                    \
+  ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B),      \
+                                         (__v2di)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)-1))
+
+#define _mm_mask_rol_epi64(W, U, A, B)                                         \
+  ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B),      \
+                                          (__v2di)(__m128i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm_maskz_rol_epi64(U, A, B)                                           \
+  ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B),      \
+                                         (__v2di)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)(U)))
+
+#define _mm256_ror_epi64(A, B)                                                 \
+  ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B),      \
+                                          (__v4di)(__m256i)_mm256_setzero_si256 (),\
+                                          (__mmask8)-1))
+
+#define _mm256_mask_ror_epi64(W, U, A, B)                                      \
+  ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B),      \
+                                          (__v4di)(__m256i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm256_maskz_ror_epi64(U, A, B)                                        \
+  ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B),      \
+                                          (__v4di)(__m256i)_mm256_setzero_si256 (),\
+                                          (__mmask8)(U)))
+
+#define _mm_ror_epi64(A, B)                                                    \
+  ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B),      \
+                                         (__v2di)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)-1))
+
+#define _mm_mask_ror_epi64(W, U, A, B)                                         \
+  ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B),      \
+                                          (__v2di)(__m128i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm_maskz_ror_epi64(U, A, B)                                           \
+  ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B),      \
+                                         (__v2di)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)(U)))
+
+#define _mm256_rol_epi32(A, B)                                                 \
+  ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B),      \
+                                         (__v8si)(__m256i)_mm256_setzero_si256 (),\
+                                          (__mmask8)-1))
+
+#define _mm256_mask_rol_epi32(W, U, A, B)                                      \
+  ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B),      \
+                                          (__v8si)(__m256i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm256_maskz_rol_epi32(U, A, B)                                        \
+  ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B),      \
+                                         (__v8si)(__m256i)_mm256_setzero_si256 (),\
+                                          (__mmask8)(U)))
+
+#define _mm_rol_epi32(A, B)                                                    \
+  ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B),      \
+                                         (__v4si)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)-1))
+
+#define _mm_mask_rol_epi32(W, U, A, B)                                         \
+  ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B),      \
+                                          (__v4si)(__m128i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm_maskz_rol_epi32(U, A, B)                                           \
+  ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B),      \
+                                         (__v4si)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)(U)))
+
+#define _mm256_ror_epi32(A, B)                                                 \
+  ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B),      \
+                                         (__v8si)(__m256i)_mm256_setzero_si256 (),\
+                                          (__mmask8)-1))
+
+#define _mm256_mask_ror_epi32(W, U, A, B)                                      \
+  ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B),      \
+                                          (__v8si)(__m256i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm256_maskz_ror_epi32(U, A, B)                                        \
+  ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B),      \
+                                         (__v8si)(__m256i)                    \
+                                         _mm256_setzero_si256 (),             \
+                                          (__mmask8)(U)))
+
+#define _mm_ror_epi32(A, B)                                                    \
+  ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B),      \
+                                         (__v4si)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)-1))
+
+#define _mm_mask_ror_epi32(W, U, A, B)                                         \
+  ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B),      \
+                                          (__v4si)(__m128i)(W),                \
+                                          (__mmask8)(U)))
+
+#define _mm_maskz_ror_epi32(U, A, B)                                           \
+  ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B),      \
+                                         (__v4si)(__m128i)_mm_setzero_si128 (),\
+                                          (__mmask8)(U)))
+
+#define _mm256_alignr_epi32(X, Y, C)                                        \
+    ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X),          \
+        (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(X), (__mmask8)-1))
+
+#define _mm256_mask_alignr_epi32(W, U, X, Y, C)                             \
+    ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X),          \
+        (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_alignr_epi32(U, X, Y, C)                               \
+    ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X),          \
+        (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (),\
+        (__mmask8)(U)))
+
+#define _mm256_alignr_epi64(X, Y, C)                                        \
+    ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X),          \
+        (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(X), (__mmask8)-1))
+
+#define _mm256_mask_alignr_epi64(W, U, X, Y, C)                             \
+    ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X),          \
+        (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_alignr_epi64(U, X, Y, C)                               \
+    ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X),          \
+        (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (),\
+        (__mmask8)(U)))
+
+#define _mm_alignr_epi32(X, Y, C)                                           \
+    ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X),          \
+        (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(X), (__mmask8)-1))
+
+#define _mm_mask_alignr_epi32(W, U, X, Y, C)                                \
+    ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X),          \
+        (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_alignr_epi32(U, X, Y, C)                                  \
+    ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X),          \
+       (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (),\
+        (__mmask8)(U)))
+
+#define _mm_alignr_epi64(X, Y, C)                                           \
+    ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X),          \
+        (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1))
+
+#define _mm_mask_alignr_epi64(W, U, X, Y, C)                                \
+    ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X),          \
+        (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_alignr_epi64(U, X, Y, C)                                  \
+    ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X),          \
+       (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (),\
+        (__mmask8)(U)))
+
+#define _mm_mask_cvtps_ph(W, U, A, I)                                          \
+  ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I),   \
+      (__v8hi)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm_maskz_cvtps_ph(U, A, I)                                            \
+  ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I),   \
+      (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
+#define _mm256_mask_cvtps_ph(W, U, A, I)                                       \
+  ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I),        \
+      (__v8hi)(__m128i) (W), (__mmask8) (U)))
+
+#define _mm256_maskz_cvtps_ph(U, A, I)                                         \
+  ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I),        \
+      (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U)))
+
+#define _mm256_mask_srai_epi32(W, U, A, B)                             \
+  ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A),      \
+    (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srai_epi32(U, A, B)                               \
+  ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A),      \
+    (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_mask_srai_epi32(W, U, A, B)                                 \
+  ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A),       \
+    (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srai_epi32(U, A, B)                                   \
+  ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A),       \
+    (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U)))
+
+#define _mm256_srai_epi64(A, B)                                                \
+  ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A),      \
+    (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1))
+
+#define _mm256_mask_srai_epi64(W, U, A, B)                             \
+  ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A),      \
+    (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_srai_epi64(U, A, B)                               \
+  ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A),      \
+    (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U)))
+
+#define _mm_srai_epi64(A, B)                                           \
+  ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A),       \
+    (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1))
+
+#define _mm_mask_srai_epi64(W, U, A, B)                                 \
+  ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A),       \
+    (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U)))
+
+#define _mm_maskz_srai_epi64(U, A, B)                                   \
+  ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A),       \
+    (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U)))
+
+#define _mm256_mask_permutex_pd(W, U, A, B)                             \
+  ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A),       \
+    (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_permutex_pd(U, A, B)                              \
+  ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A),       \
+    (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U)))
+
+#define _mm256_mask_permute_pd(W, U, X, C)                                         \
+  ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C),     \
+                                             (__v4df)(__m256d)(W),                 \
+                                             (__mmask8)(U)))
+
+#define _mm256_maskz_permute_pd(U, X, C)                                           \
+  ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C),     \
+                                             (__v4df)(__m256d)_mm256_setzero_pd (),\
+                                             (__mmask8)(U)))
+
+#define _mm256_mask_permute_ps(W, U, X, C)                                         \
+  ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C),       \
+                                             (__v8sf)(__m256)(W), (__mmask8)(U)))
+
+#define _mm256_maskz_permute_ps(U, X, C)                                           \
+  ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C),       \
+                                             (__v8sf)(__m256)_mm256_setzero_ps (), \
+                                             (__mmask8)(U)))
+
+#define _mm_mask_permute_pd(W, U, X, C)                                                    \
+  ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C),        \
+                                           (__v2df)(__m128d)(W), (__mmask8)(U)))
+
+#define _mm_maskz_permute_pd(U, X, C)                                              \
+  ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C),        \
+                                           (__v2df)(__m128d)_mm_setzero_pd (),     \
+                                           (__mmask8)(U)))
+
+#define _mm_mask_permute_ps(W, U, X, C)                                                    \
+  ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C),          \
+                                         (__v4sf)(__m128)(W), (__mmask8)(U)))
+
+#define _mm_maskz_permute_ps(U, X, C)                                              \
+  ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C),          \
+                                         (__v4sf)(__m128)_mm_setzero_ps (),        \
+                                         (__mmask8)(U)))
+
+#define _mm256_mask_blend_pd(__U, __A, __W)                          \
+  ((__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) (__A),       \
+                                                    (__v4df) (__W),  \
+                                                    (__mmask8) (__U)))
+
+#define _mm256_mask_blend_ps(__U, __A, __W)                          \
+  ((__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) (__A),        \
+                                                   (__v8sf) (__W),   \
+                                                   (__mmask8) (__U)))
+
+#define _mm256_mask_blend_epi64(__U, __A, __W)                       \
+  ((__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) (__A),        \
+                                                   (__v4di) (__W),   \
+                                                   (__mmask8) (__U)))
+
+#define _mm256_mask_blend_epi32(__U, __A, __W)                       \
+  ((__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) (__A),        \
+                                                   (__v8si) (__W),   \
+                                                   (__mmask8) (__U)))
+
+#define _mm_mask_blend_pd(__U, __A, __W)                             \
+  ((__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) (__A),       \
+                                                    (__v2df) (__W),  \
+                                                    (__mmask8) (__U)))
+
+#define _mm_mask_blend_ps(__U, __A, __W)                             \
+  ((__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) (__A),        \
+                                                   (__v4sf) (__W),   \
+                                                   (__mmask8) (__U)))
+
+#define _mm_mask_blend_epi64(__U, __A, __W)                          \
+  ((__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) (__A),        \
+                                                   (__v2di) (__W),   \
+                                                   (__mmask8) (__U)))
+
+#define _mm_mask_blend_epi32(__U, __A, __W)                          \
+  ((__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) (__A),        \
+                                                   (__v4si) (__W),   \
+                                                   (__mmask8) (__U)))
+
+#define _mm256_cmp_epu32_mask(X, Y, P)                                 \
+  ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X),      \
+                                           (__v8si)(__m256i)(Y), (int)(P),\
+                                           (__mmask8)-1))
+
+#define _mm256_cmp_epi64_mask(X, Y, P)                                 \
+  ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X),       \
+                                          (__v4di)(__m256i)(Y), (int)(P),\
+                                          (__mmask8)-1))
+
+#define _mm256_cmp_epi32_mask(X, Y, P)                                 \
+  ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X),       \
+                                          (__v8si)(__m256i)(Y), (int)(P),\
+                                          (__mmask8)-1))
+
+#define _mm256_cmp_epu64_mask(X, Y, P)                                 \
+  ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X),      \
+                                           (__v4di)(__m256i)(Y), (int)(P),\
+                                           (__mmask8)-1))
+
+#define _mm256_cmp_pd_mask(X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X),      \
+                                           (__v4df)(__m256d)(Y), (int)(P),\
+                                           (__mmask8)-1))
+
+#define _mm256_cmp_ps_mask(X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X),       \
+                                            (__v8sf)(__m256)(Y), (int)(P),\
+                                            (__mmask8)-1))
+
+#define _mm256_mask_cmp_epi64_mask(M, X, Y, P)                         \
+  ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X),       \
+                                          (__v4di)(__m256i)(Y), (int)(P),\
+                                          (__mmask8)(M)))
+
+#define _mm256_mask_cmp_epi32_mask(M, X, Y, P)                         \
+  ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X),       \
+                                          (__v8si)(__m256i)(Y), (int)(P),\
+                                          (__mmask8)(M)))
+
+#define _mm256_mask_cmp_epu64_mask(M, X, Y, P)                         \
+  ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X),      \
+                                           (__v4di)(__m256i)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm256_mask_cmp_epu32_mask(M, X, Y, P)                         \
+  ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X),      \
+                                           (__v8si)(__m256i)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm256_mask_cmp_pd_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X),      \
+                                           (__v4df)(__m256d)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm256_mask_cmp_ps_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X),       \
+                                            (__v8sf)(__m256)(Y), (int)(P),\
+                                            (__mmask8)(M)))
+
+#define _mm_cmp_epi64_mask(X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X),       \
+                                          (__v2di)(__m128i)(Y), (int)(P),\
+                                          (__mmask8)-1))
+
+#define _mm_cmp_epi32_mask(X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X),       \
+                                          (__v4si)(__m128i)(Y), (int)(P),\
+                                          (__mmask8)-1))
+
+#define _mm_cmp_epu64_mask(X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X),      \
+                                           (__v2di)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)-1))
+
+#define _mm_cmp_epu32_mask(X, Y, P)                                    \
+  ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X),      \
+                                           (__v4si)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)-1))
+
+#define _mm_cmp_pd_mask(X, Y, P)                                       \
+  ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X),      \
+                                           (__v2df)(__m128d)(Y), (int)(P),\
+                                           (__mmask8)-1))
+
+#define _mm_cmp_ps_mask(X, Y, P)                                       \
+  ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X),       \
+                                            (__v4sf)(__m128)(Y), (int)(P),\
+                                            (__mmask8)-1))
+
+#define _mm_mask_cmp_epi64_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X),       \
+                                          (__v2di)(__m128i)(Y), (int)(P),\
+                                          (__mmask8)(M)))
+
+#define _mm_mask_cmp_epi32_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X),       \
+                                          (__v4si)(__m128i)(Y), (int)(P),\
+                                          (__mmask8)(M)))
+
+#define _mm_mask_cmp_epu64_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X),      \
+                                           (__v2di)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm_mask_cmp_epu32_mask(M, X, Y, P)                            \
+  ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X),      \
+                                           (__v4si)(__m128i)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm_mask_cmp_pd_mask(M, X, Y, P)                               \
+  ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X),      \
+                                           (__v2df)(__m128d)(Y), (int)(P),\
+                                           (__mmask8)(M)))
+
+#define _mm_mask_cmp_ps_mask(M, X, Y, P)                               \
+  ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X),       \
+                                            (__v4sf)(__m128)(Y), (int)(P),\
+                                            (__mmask8)(M)))
+
+#endif
+
+#define _mm256_permutexvar_ps(A, B)    _mm256_permutevar8x32_ps ((B), (A))
+#define _mm256_mask_cvt_roundps_ph(A, B, C, D) \
+  _mm256_mask_cvtps_ph ((A), (B), (C), (D))
+#define _mm256_maskz_cvt_roundps_ph(A, B, C)   \
+  _mm256_maskz_cvtps_ph ((A), (B), (C))
+#define _mm_mask_cvt_roundps_ph(A, B, C, D)    \
+  _mm_mask_cvtps_ph ((A), (B), (C), (D))
+#define _mm_maskz_cvt_roundps_ph(A, B, C) _mm_maskz_cvtps_ph ((A), (B), (C))
+
+#ifdef __DISABLE_AVX512VL__
+#undef __DISABLE_AVX512VL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VL__ */
+
+#endif /* _AVX512VLINTRIN_H_INCLUDED */
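A minimal usage sketch of the AVX-512VL scatter macros defined above; the build line and file name are illustrative only, assuming a toolchain with AVX512F/AVX512VL enabled (e.g. gcc -O2 -mavx512f -mavx512vl demo.c):

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  int dst[16] = {0};
  /* lane i of val is written to dst[idx[i]]; the scale argument is in bytes */
  __m256i idx = _mm256_setr_epi32 (0, 2, 4, 6, 8, 10, 12, 14);
  __m256i val = _mm256_setr_epi32 (1, 2, 3, 4, 5, 6, 7, 8);
  _mm256_i32scatter_epi32 (dst, idx, val, 4);
  for (int i = 0; i < 16; i++)
    printf ("%d ", dst[i]);
  printf ("\n");
  return 0;
}
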
diff --git a/include-gcc/avx512vnniintrin.h b/include-gcc/avx512vnniintrin.h
new file mode 100644 (file)
index 0000000..e36e2e5
--- /dev/null
@@ -0,0 +1,144 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX512VNNIINTRIN_H_INCLUDED
+#define __AVX512VNNIINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VNNI__)
+#pragma GCC push_options
+#pragma GCC target("avx512vnni")
+#define __DISABLE_AVX512VNNI__
+#endif /* __AVX512VNNI__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbusd_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpdpbusd_v16si ((__v16si)__A, (__v16si) __B,
+                                                               (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbusd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask ((__v16si)__A,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbusd_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+                                                       __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz ((__v16si)__B,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpbusds_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpdpbusds_v16si ((__v16si)__A, (__v16si) __B,
+                                                        (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpbusds_epi32 (__m512i __A, __mmask16 __B, __m512i __C,
+                                                       __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask ((__v16si)__A,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpbusds_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+                                                       __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz ((__v16si)__B,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpdpwssd_v16si ((__v16si)__A, (__v16si) __B,
+                                                               (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpwssd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask ((__v16si)__A,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpwssd_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+                                                       __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz ((__v16si)__B,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vpdpwssds_v16si ((__v16si)__A, (__v16si) __B,
+                                                               (__v16si) __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_dpwssds_epi32 (__m512i __A, __mmask16 __B, __m512i __C,
+                                                       __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask ((__v16si)__A,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_dpwssds_epi32 (__mmask16 __A, __m512i __B, __m512i __C,
+                                                       __m512i __D)
+{
+  return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz ((__v16si)__B,
+                               (__v16si) __C, (__v16si) __D, (__mmask16)__A);
+}
+
+#ifdef __DISABLE_AVX512VNNI__
+#undef __DISABLE_AVX512VNNI__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VNNI__ */
+
+#endif /* __AVX512VNNIINTRIN_H_INCLUDED */
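A minimal sketch of the unmasked VNNI intrinsic declared above: each 32-bit lane accumulates four unsigned-byte-by-signed-byte products into the source accumulator. The build line is illustrative (e.g. gcc -O2 -mavx512f -mavx512vnni demo.c):

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  __m512i acc = _mm512_setzero_si512 ();
  __m512i a = _mm512_set1_epi8 (2);   /* treated as unsigned bytes */
  __m512i b = _mm512_set1_epi8 (3);   /* treated as signed bytes */
  acc = _mm512_dpbusd_epi32 (acc, a, b);
  int out[16];
  _mm512_storeu_si512 (out, acc);
  printf ("%d\n", out[0]);            /* expected: 4 * (2 * 3) = 24 */
  return 0;
}
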
diff --git a/include-gcc/avx512vnnivlintrin.h b/include-gcc/avx512vnnivlintrin.h
new file mode 100644 (file)
index 0000000..c62a6e8
--- /dev/null
@@ -0,0 +1,210 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vnnivlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VNNIVLINTRIN_H_INCLUDED
+#define _AVX512VNNIVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__)
+#pragma GCC push_options
+#pragma GCC target("avx512vnni,avx512vl")
+#define __DISABLE_AVX512VNNIVL__
+#endif /* __AVX512VNNIVL__ */
+
+#define _mm256_dpbusd_epi32(A, B, C)                           \
+  ((__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) (A),       \
+                                          (__v8si) (B),        \
+                                          (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbusd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask ((__v8si)__A, (__v8si) __C,
+                                               (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbusd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz ((__v8si)__B,
+                               (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpbusd_epi32(A, B, C)                              \
+  ((__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) (A),       \
+                                          (__v4si) (B),        \
+                                          (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbusd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask ((__v4si)__A, (__v4si) __C,
+                                               (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbusd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz ((__v4si)__B,
+                               (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+
+#define _mm256_dpbusds_epi32(A, B, C)                          \
+  ((__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) (A),      \
+                                           (__v8si) (B),       \
+                                           (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpbusds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask ((__v8si)__A,
+                               (__v8si) __C, (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpbusds_epi32 (__mmask8 __A, __m256i __B, __m256i __C,
+                                                               __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz ((__v8si)__B,
+                               (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpbusds_epi32(A, B, C)                             \
+  ((__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) (A),      \
+                                           (__v4si) (B),       \
+                                           (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpbusds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask ((__v4si)__A,
+                               (__v4si) __C, (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpbusds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz ((__v4si)__B,
+                               (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+
+#define _mm256_dpwssd_epi32(A, B, C)                           \
+  ((__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) (A),       \
+                                          (__v8si) (B),        \
+                                          (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpwssd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask ((__v8si)__A, (__v8si) __C,
+                                               (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpwssd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz ((__v8si)__B,
+                               (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpwssd_epi32(A, B, C)                              \
+  ((__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) (A),       \
+                                          (__v4si) (B),        \
+                                          (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpwssd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask ((__v4si)__A, (__v4si) __C,
+                                               (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpwssd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz ((__v4si)__B,
+                               (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+
+#define _mm256_dpwssds_epi32(A, B, C)                          \
+  ((__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si) (A),      \
+                                           (__v8si) (B),       \
+                                           (__v8si) (C)))
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_dpwssds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask ((__v8si)__A,
+                               (__v8si) __C, (__v8si) __D, (__mmask8)__B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_dpwssds_epi32 (__mmask8 __A, __m256i __B, __m256i __C,
+                                                       __m256i __D)
+{
+  return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz ((__v8si)__B,
+                               (__v8si) __C, (__v8si) __D, (__mmask8)__A);
+}
+
+#define _mm_dpwssds_epi32(A, B, C)                             \
+  ((__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) (A),      \
+                                           (__v4si) (B),       \
+                                           (__v4si) (C)))
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_dpwssds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask ((__v4si)__A,
+                               (__v4si) __C, (__v4si) __D, (__mmask8)__B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_dpwssds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D)
+{
+  return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz ((__v4si)__B,
+                               (__v4si) __C, (__v4si) __D, (__mmask8)__A);
+}
+#ifdef __DISABLE_AVX512VNNIVL__
+#undef __DISABLE_AVX512VNNIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VNNIVL__ */
+#endif /* _AVX512VNNIVLINTRIN_H_INCLUDED */
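A minimal sketch of the 256-bit masked form declared above: lanes whose mask bit is clear keep the accumulator value unchanged. Flags and file name are illustrative (e.g. gcc -O2 -mavx512vnni -mavx512vl demo.c):

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  __m256i acc = _mm256_set1_epi32 (100);
  __m256i a = _mm256_set1_epi8 (1);
  __m256i b = _mm256_set1_epi8 (5);
  /* update only the even-numbered 32-bit lanes (mask 0x55) */
  __m256i r = _mm256_mask_dpbusd_epi32 (acc, 0x55, a, b);
  int out[8];
  _mm256_storeu_si256 ((__m256i *) out, r);
  printf ("%d %d\n", out[0], out[1]); /* expected: 120 100 */
  return 0;
}
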
diff --git a/include-gcc/avx512vp2intersectintrin.h b/include-gcc/avx512vp2intersectintrin.h
new file mode 100644 (file)
index 0000000..65e2fb1
--- /dev/null
@@ -0,0 +1,58 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VP2INTERSECTINTRIN_H_INCLUDED
+#define _AVX512VP2INTERSECTINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VP2INTERSECT__)
+#pragma GCC push_options
+#pragma GCC target("avx512vp2intersect")
+#define __DISABLE_AVX512VP2INTERSECT__
+#endif /* __AVX512VP2INTERSECT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_2intersect_epi32 (__m512i __A, __m512i __B, __mmask16 *__U,
+                        __mmask16 *__M)
+{
+  __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_2intersect_epi64 (__m512i __A, __m512i __B, __mmask8 *__U,
+                        __mmask8 *__M)
+{
+  __builtin_ia32_2intersectq512 (__U, __M, (__v8di) __A, (__v8di) __B);
+}
+
+#ifdef __DISABLE_AVX512VP2INTERSECT__
+#undef __DISABLE_AVX512VP2INTERSECT__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VP2INTERSECT__ */
+
+#endif /* _AVX512VP2INTERSECTINTRIN_H_INCLUDED */
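A minimal sketch of the 512-bit VP2INTERSECT intrinsic declared above; it reports, for each input, which lanes hold a value that also occurs somewhere in the other input. The flags are illustrative, and a VP2INTERSECT-capable CPU is needed to run the result (e.g. gcc -O2 -mavx512f -mavx512vp2intersect demo.c):

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  __m512i a = _mm512_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7,
                                 8, 9, 10, 11, 12, 13, 14, 15);
  __m512i b = _mm512_set1_epi32 (3);
  __mmask16 ka, kb;
  _mm512_2intersect_epi32 (a, b, &ka, &kb);
  /* ka marks lanes of a found in b; kb marks lanes of b found in a */
  printf ("ka=%#x kb=%#x\n", (unsigned) ka, (unsigned) kb); /* 0x8 0xffff */
  return 0;
}
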
diff --git a/include-gcc/avx512vp2intersectvlintrin.h b/include-gcc/avx512vp2intersectvlintrin.h
new file mode 100644 (file)
index 0000000..ce68aee
--- /dev/null
@@ -0,0 +1,72 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avx512vp2intersectintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
+#define _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VP2INTERSECT__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512vp2intersect,avx512vl")
+#define __DISABLE_AVX512VP2INTERSECTVL__
+#endif /* __AVX512VP2INTERSECTVL__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_2intersect_epi32 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M)
+{
+  __builtin_ia32_2intersectd128 (__U, __M, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_2intersect_epi32 (__m256i __A, __m256i __B, __mmask8 *__U,
+                        __mmask8 *__M)
+{
+  __builtin_ia32_2intersectd256 (__U, __M, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_2intersect_epi64 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M)
+{
+  __builtin_ia32_2intersectq128 (__U, __M, (__v2di) __A, (__v2di) __B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_2intersect_epi64 (__m256i __A, __m256i __B, __mmask8 *__U,
+                        __mmask8 *__M)
+{
+  __builtin_ia32_2intersectq256 (__U, __M, (__v4di) __A, (__v4di) __B);
+}
+
+#ifdef __DISABLE_AVX512VP2INTERSECTVL__
+#undef __DISABLE_AVX512VP2INTERSECTVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VP2INTERSECTVL__ */
+
+#endif /* _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED */
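The same idea with the 128-bit variant declared above (flags illustrative, e.g. gcc -O2 -mavx512vp2intersect -mavx512vl demo.c):

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  __m128i a = _mm_setr_epi32 (10, 20, 30, 40);
  __m128i b = _mm_setr_epi32 (40, 50, 10, 60);
  __mmask8 ka, kb;
  _mm_2intersect_epi32 (a, b, &ka, &kb);
  printf ("ka=%#x kb=%#x\n", (unsigned) ka, (unsigned) kb); /* 0x9 0x5 */
  return 0;
}
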
diff --git a/include-gcc/avx512vpopcntdqintrin.h b/include-gcc/avx512vpopcntdqintrin.h
new file mode 100644 (file)
index 0000000..47897fb
--- /dev/null
@@ -0,0 +1,94 @@
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512vpopcntdqintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED
+#define _AVX512VPOPCNTDQINTRIN_H_INCLUDED
+
+#ifndef __AVX512VPOPCNTDQ__
+#pragma GCC push_options
+#pragma GCC target("avx512vpopcntdq")
+#define __DISABLE_AVX512VPOPCNTDQ__
+#endif /* __AVX512VPOPCNTDQ__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi32 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountd_v16si ((__v16si) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A,
+                                                        (__v16si) __W,
+                                                        (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi32 (__mmask16 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A,
+                                                        (__v16si)
+                                                        _mm512_setzero_si512 (),
+                                                        (__mmask16) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_popcnt_epi64 (__m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountq_v8di ((__v8di) __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_popcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A,
+                                                       (__v8di) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_popcnt_epi64 (__mmask8 __U, __m512i __A)
+{
+  return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A,
+                                                       (__v8di)
+                                                       _mm512_setzero_si512 (),
+                                                       (__mmask8) __U);
+}
+
+#ifdef __DISABLE_AVX512VPOPCNTDQ__
+#undef __DISABLE_AVX512VPOPCNTDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VPOPCNTDQ__ */
+
+#endif /* _AVX512VPOPCNTDQINTRIN_H_INCLUDED */
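A minimal sketch of the per-lane population count declared above (flags illustrative, e.g. gcc -O2 -mavx512f -mavx512vpopcntdq demo.c):

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  __m512i v = _mm512_set1_epi32 (0x0F0F00FF);  /* 16 set bits per lane */
  __m512i c = _mm512_popcnt_epi32 (v);
  int out[16];
  _mm512_storeu_si512 (out, c);
  printf ("%d\n", out[0]);                     /* expected: 16 */
  return 0;
}
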
diff --git a/include-gcc/avx512vpopcntdqvlintrin.h b/include-gcc/avx512vpopcntdqvlintrin.h
new file mode 100644 (file)
index 0000000..972ab3b
--- /dev/null
@@ -0,0 +1,146 @@
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <avx512vpopcntdqvlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
+#define _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED
+
+#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("avx512vpopcntdq,avx512vl")
+#define __DISABLE_AVX512VPOPCNTDQVL__
+#endif /* __AVX512VPOPCNTDQVL__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi32 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountd_v4si ((__v4si) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi32 (__m128i __W, __mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A,
+                                                        (__v4si) __W,
+                                                        (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi32 (__mmask16 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A,
+                                                        (__v4si)
+                                                        _mm_setzero_si128 (),
+                                                        (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi32 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountd_v8si ((__v8si) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi32 (__m256i __W, __mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A,
+                                                        (__v8si) __W,
+                                                        (__mmask16) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi32 (__mmask16 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A,
+                                               (__v8si)
+                                               _mm256_setzero_si256 (),
+                                               (__mmask16) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_epi64 (__m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountq_v2di ((__v2di) __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_popcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A,
+                                                       (__v2di) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_popcnt_epi64 (__mmask8 __U, __m128i __A)
+{
+  return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A,
+                                                       (__v2di)
+                                                       _mm_setzero_si128 (),
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_popcnt_epi64 (__m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountq_v4di ((__v4di) __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_popcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A,
+                                                       (__v4di) __W,
+                                                       (__mmask8) __U);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_popcnt_epi64 (__mmask8 __U, __m256i __A)
+{
+  return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A,
+                                               (__v4di)
+                                               _mm256_setzero_si256 (),
+                                               (__mmask8) __U);
+}
+
+#ifdef __DISABLE_AVX512VPOPCNTDQVL__
+#undef __DISABLE_AVX512VPOPCNTDQVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX512VPOPCNTDQVL__ */
+
+#endif /* _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED */
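Similarly, a hedged sketch of the 128/256-bit VL variants declared above (illustrative only; assumes -mavx512vpopcntdq -mavx512vl):

    #include <immintrin.h>

    /* Per-lane population count over the four 64-bit lanes of v, merging the
       results into the lanes of dst selected by keep.  */
    static inline __m256i
    popcnt64_merge (__m256i dst, __mmask8 keep, __m256i v)
    {
      return _mm256_mask_popcnt_epi64 (dst, keep, v);
    }
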
diff --git a/include-gcc/avxifmaintrin.h b/include-gcc/avxifmaintrin.h
new file mode 100644 (file)
index 0000000..076cc9f
--- /dev/null
@@ -0,0 +1,78 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXIFMAINTRIN_H_INCLUDED
+#define _AVXIFMAINTRIN_H_INCLUDED
+
+#ifndef __AVXIFMA__
+#pragma GCC push_options
+#pragma GCC target("avxifma")
+#define __DISABLE_AVXIFMA__
+#endif /* __AVXIFMA__ */
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) __X,
+                                                 (__v2di) __Y,
+                                                 (__v2di) __Z);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+{
+  return (__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) __X,
+                                                 (__v2di) __Y,
+                                                 (__v2di) __Z);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) __X,
+                                                 (__v4di) __Y,
+                                                 (__v4di) __Z);
+}
+
+extern __inline __m256i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+{
+  return (__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) __X,
+                                                 (__v4di) __Y,
+                                                 (__v4di) __Z);
+}
+
+#ifdef __DISABLE_AVXIFMA__
+#undef __DISABLE_AVXIFMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXIFMA__ */
+
+#endif /* _AVXIFMAINTRIN_H_INCLUDED */
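A short, hedged example of the AVX-IFMA intrinsics declared above (illustrative only; assumes -mavxifma): each 64-bit lane of the result is the accumulator plus the low 52 bits of the product of the low 52 bits of the two multiplicands.

    #include <immintrin.h>

    /* acc[i] += low 52 bits of (a[i][51:0] * b[i][51:0]) for each 64-bit lane.  */
    static inline __m256i
    madd52lo_accumulate (__m256i acc, __m256i a, __m256i b)
    {
      return _mm256_madd52lo_avx_epu64 (acc, a, b);
    }
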
diff --git a/include-gcc/avxintrin.h b/include-gcc/avxintrin.h
new file mode 100644 (file)
index 0000000..a4166bf
--- /dev/null
@@ -0,0 +1,1607 @@
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 11.0.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXINTRIN_H_INCLUDED
+#define _AVXINTRIN_H_INCLUDED
+
+#ifndef __AVX__
+#pragma GCC push_options
+#pragma GCC target("avx")
+#define __DISABLE_AVX__
+#endif /* __AVX__ */
+
+/* Internal data types for implementing the intrinsics.  */
+typedef double __v4df __attribute__ ((__vector_size__ (32)));
+typedef float __v8sf __attribute__ ((__vector_size__ (32)));
+typedef long long __v4di __attribute__ ((__vector_size__ (32)));
+typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
+typedef int __v8si __attribute__ ((__vector_size__ (32)));
+typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
+typedef short __v16hi __attribute__ ((__vector_size__ (32)));
+typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
+typedef char __v32qi __attribute__ ((__vector_size__ (32)));
+typedef signed char __v32qs __attribute__ ((__vector_size__ (32)));
+typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m256 __attribute__ ((__vector_size__ (32),
+                                    __may_alias__));
+typedef long long __m256i __attribute__ ((__vector_size__ (32),
+                                         __may_alias__));
+typedef double __m256d __attribute__ ((__vector_size__ (32),
+                                      __may_alias__));
+
+/* Unaligned version of the same types.  */
+typedef float __m256_u __attribute__ ((__vector_size__ (32),
+                                      __may_alias__,
+                                      __aligned__ (1)));
+typedef long long __m256i_u __attribute__ ((__vector_size__ (32),
+                                           __may_alias__,
+                                           __aligned__ (1)));
+typedef double __m256d_u __attribute__ ((__vector_size__ (32),
+                                        __may_alias__,
+                                        __aligned__ (1)));
+
+/* Compare predicates for scalar and packed compare intrinsics.  */
+
+/* Equal (ordered, non-signaling)  */
+#define _CMP_EQ_OQ     0x00
+/* Less-than (ordered, signaling)  */
+#define _CMP_LT_OS     0x01
+/* Less-than-or-equal (ordered, signaling)  */
+#define _CMP_LE_OS     0x02
+/* Unordered (non-signaling)  */
+#define _CMP_UNORD_Q   0x03
+/* Not-equal (unordered, non-signaling)  */
+#define _CMP_NEQ_UQ    0x04
+/* Not-less-than (unordered, signaling)  */
+#define _CMP_NLT_US    0x05
+/* Not-less-than-or-equal (unordered, signaling)  */
+#define _CMP_NLE_US    0x06
+/* Ordered (nonsignaling)   */
+#define _CMP_ORD_Q     0x07
+/* Equal (unordered, non-signaling)  */
+#define _CMP_EQ_UQ     0x08
+/* Not-greater-than-or-equal (unordered, signaling)  */
+#define _CMP_NGE_US    0x09
+/* Not-greater-than (unordered, signaling)  */
+#define _CMP_NGT_US    0x0a
+/* False (ordered, non-signaling)  */
+#define _CMP_FALSE_OQ  0x0b
+/* Not-equal (ordered, non-signaling)  */
+#define _CMP_NEQ_OQ    0x0c
+/* Greater-than-or-equal (ordered, signaling)  */
+#define _CMP_GE_OS     0x0d
+/* Greater-than (ordered, signaling)  */
+#define _CMP_GT_OS     0x0e
+/* True (unordered, non-signaling)  */
+#define _CMP_TRUE_UQ   0x0f
+/* Equal (ordered, signaling)  */
+#define _CMP_EQ_OS     0x10
+/* Less-than (ordered, non-signaling)  */
+#define _CMP_LT_OQ     0x11
+/* Less-than-or-equal (ordered, non-signaling)  */
+#define _CMP_LE_OQ     0x12
+/* Unordered (signaling)  */
+#define _CMP_UNORD_S   0x13
+/* Not-equal (unordered, signaling)  */
+#define _CMP_NEQ_US    0x14
+/* Not-less-than (unordered, non-signaling)  */
+#define _CMP_NLT_UQ    0x15
+/* Not-less-than-or-equal (unordered, non-signaling)  */
+#define _CMP_NLE_UQ    0x16
+/* Ordered (signaling)  */
+#define _CMP_ORD_S     0x17
+/* Equal (unordered, signaling)  */
+#define _CMP_EQ_US     0x18
+/* Not-greater-than-or-equal (unordered, non-signaling)  */
+#define _CMP_NGE_UQ    0x19
+/* Not-greater-than (unordered, non-signaling)  */
+#define _CMP_NGT_UQ    0x1a
+/* False (ordered, signaling)  */
+#define _CMP_FALSE_OS  0x1b
+/* Not-equal (ordered, signaling)  */
+#define _CMP_NEQ_OS    0x1c
+/* Greater-than-or-equal (ordered, non-signaling)  */
+#define _CMP_GE_OQ     0x1d
+/* Greater-than (ordered, non-signaling)  */
+#define _CMP_GT_OQ     0x1e
+/* True (unordered, signaling)  */
+#define _CMP_TRUE_US   0x1f
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) ((__v4df)__A + (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_add_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) ((__v8sf)__A + (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_addsub_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_addsub_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_and_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_andnot_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+/* Double/single precision floating point blend instructions - select
+   data from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M)
+{
+  return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X,
+                                             (__v4df)__Y,
+                                             __M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M)
+{
+  return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X,
+                                            (__v8sf)__Y,
+                                            __M);
+}
+#else
+#define _mm256_blend_pd(X, Y, M)                                       \
+  ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X),          \
+                                       (__v4df)(__m256d)(Y), (int)(M)))
+
+#define _mm256_blend_ps(X, Y, M)                                       \
+  ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X),            \
+                                      (__v8sf)(__m256)(Y), (int)(M)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M)
+{
+  return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X,
+                                              (__v4df)__Y,
+                                              (__v4df)__M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M)
+{
+  return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X,
+                                             (__v8sf)__Y,
+                                             (__v8sf)__M);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) ((__v4df)__A / (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_div_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) ((__v8sf)__A / (__v8sf)__B);
+}
+
+/* Dot product instructions with mask-defined summing and zeroing parts
+   of result.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M)
+{
+  return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X,
+                                         (__v8sf)__Y,
+                                         __M);
+}
+#else
+#define _mm256_dp_ps(X, Y, M)                                          \
+  ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X),               \
+                                   (__v8sf)(__m256)(Y), (int)(M)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_pd (__m256d __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hadd_ps (__m256 __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_pd (__m256d __X, __m256d __Y)
+{
+  return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_hsub_ps (__m256 __X, __m256 __Y)
+{
+  return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_max_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_min_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) ((__v4df)__A * (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mul_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) ((__v8sf)__A * (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_or_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask)
+{
+  return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B,
+                                            __mask);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask)
+{
+  return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B,
+                                           __mask);
+}
+#else
+#define _mm256_shuffle_pd(A, B, N)                                     \
+  ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A),            \
+                                     (__v4df)(__m256d)(B), (int)(N)))
+
+#define _mm256_shuffle_ps(A, B, N)                                     \
+  ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A),             \
+                                     (__v8sf)(__m256)(B), (int)(N)))
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) ((__v4df)__A - (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sub_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) ((__v8sf)__A - (__v8sf)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_xor_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P)
+{
+  return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y,
+                                           __P);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P)
+{
+  return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y,
+                                          __P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P)
+{
+  return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P)
+{
+  return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P);
+}
+#else
+#define _mm_cmp_pd(X, Y, P)                                            \
+  ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X),               \
+                                  (__v2df)(__m128d)(Y), (int)(P)))
+
+#define _mm_cmp_ps(X, Y, P)                                            \
+  ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X),                 \
+                                 (__v4sf)(__m128)(Y), (int)(P)))
+
+#define _mm256_cmp_pd(X, Y, P)                                         \
+  ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X),            \
+                                     (__v4df)(__m256d)(Y), (int)(P)))
+
+#define _mm256_cmp_ps(X, Y, P)                                         \
+  ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X),              \
+                                    (__v8sf)(__m256)(Y), (int)(P)))
+
+#define _mm_cmp_sd(X, Y, P)                                            \
+  ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X),               \
+                                  (__v2df)(__m128d)(Y), (int)(P)))
+
+#define _mm_cmp_ss(X, Y, P)                                            \
+  ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X),                 \
+                                 (__v4sf)(__m128)(Y), (int)(P)))
+#endif
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsi256_si32 (__m256i __A)
+{
+  __v8si __B = (__v8si) __A;
+  return __B[0];
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_pd (__m128i __A)
+{
+  return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtepi32_ps (__m256i __A)
+{
+  return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_ps (__m256d __A)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_epi32 (__m256 __A)
+{
+  return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_pd (__m128 __A)
+{
+  return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttpd_epi32 (__m256d __A)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtpd_epi32 (__m256d __A)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvttps_epi32 (__m256 __A)
+{
+  return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A);
+}
+
+extern __inline double
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtsd_f64 (__m256d __A)
+{
+  return __A[0];
+}
+
+extern __inline float
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtss_f32 (__m256 __A)
+{
+  return __A[0];
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_pd (__m256d __X, const int __N)
+{
+  return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_ps (__m256 __X, const int __N)
+{
+  return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extractf128_si256 (__m256i __X, const int __N)
+{
+  return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi32 (__m256i __X, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
+  return _mm_extract_epi32 (__Y, __N % 4);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi16 (__m256i __X, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
+  return _mm_extract_epi16 (__Y, __N % 8);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi8 (__m256i __X, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
+  return _mm_extract_epi8 (__Y, __N % 16);
+}
+
+#ifdef __x86_64__
+extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_extract_epi64 (__m256i __X, const int __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
+  return _mm_extract_epi64 (__Y, __N % 2);
+}
+#endif
+#else
+#define _mm256_extractf128_pd(X, N)                                    \
+  ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X),  \
+                                               (int)(N)))
+
+#define _mm256_extractf128_ps(X, N)                                    \
+  ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X),    \
+                                              (int)(N)))
+
+#define _mm256_extractf128_si256(X, N)                                 \
+  ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X),  \
+                                               (int)(N)))
+
+#define _mm256_extract_epi32(X, N)                                     \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);          \
+      _mm_extract_epi32 (__Y, (N) % 4);                                        \
+    }))
+
+#define _mm256_extract_epi16(X, N)                                     \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);          \
+      _mm_extract_epi16 (__Y, (N) % 8);                                        \
+    }))
+
+#define _mm256_extract_epi8(X, N)                                      \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);          \
+      _mm_extract_epi8 (__Y, (N) % 16);                                        \
+    }))
+
+#ifdef __x86_64__
+#define _mm256_extract_epi64(X, N)                                     \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);          \
+      _mm_extract_epi64 (__Y, (N) % 2);                                        \
+    }))
+#endif
+#endif
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zeroall (void)
+{
+  __builtin_ia32_vzeroall ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zeroupper (void)
+{
+  __builtin_ia32_vzeroupper ();
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutevar_pd (__m128d __A, __m128i __C)
+{
+  return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A,
+                                               (__v2di)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar_pd (__m256d __A, __m256i __C)
+{
+  return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A,
+                                                  (__v4di)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permutevar_ps (__m128 __A, __m128i __C)
+{
+  return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A,
+                                              (__v4si)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permutevar_ps (__m256 __A, __m256i __C)
+{
+  return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A,
+                                                 (__v8si)__C);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute_pd (__m128d __X, const int __C)
+{
+  return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute_pd (__m256d __X, const int __C)
+{
+  return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute_ps (__m128 __X, const int __C)
+{
+  return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute_ps (__m256 __X, const int __C)
+{
+  return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C);
+}
+#else
+#define _mm_permute_pd(X, C)                                           \
+  ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C)))
+
+#define _mm256_permute_pd(X, C)                                                \
+  ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X),        (int)(C)))
+
+#define _mm_permute_ps(X, C)                                           \
+  ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C)))
+
+#define _mm256_permute_ps(X, C)                                                \
+  ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C)))
+#endif
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C)
+{
+  return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X,
+                                                   (__v4df)__Y,
+                                                   __C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C)
+{
+  return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X,
+                                                  (__v8sf)__Y,
+                                                  __C);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C)
+{
+  return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X,
+                                                   (__v8si)__Y,
+                                                   __C);
+}
+#else
+#define _mm256_permute2f128_pd(X, Y, C)                                        \
+  ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X),    \
+                                             (__v4df)(__m256d)(Y),     \
+                                             (int)(C)))
+
+#define _mm256_permute2f128_ps(X, Y, C)                                        \
+  ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X),      \
+                                            (__v8sf)(__m256)(Y),       \
+                                            (int)(C)))
+
+#define _mm256_permute2f128_si256(X, Y, C)                             \
+  ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X),    \
+                                             (__v8si)(__m256i)(Y),     \
+                                             (int)(C)))
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_broadcast_ss (float const *__X)
+{
+  return (__m128) __builtin_ia32_vbroadcastss (__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_sd (double const *__X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastsd256 (__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_ss (float const *__X)
+{
+  return (__m256) __builtin_ia32_vbroadcastss256 (__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_pd (__m128d const *__X)
+{
+  return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_broadcast_ps (__m128 const *__X)
+{
+  return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O)
+{
+  return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X,
+                                                    (__v2df)__Y,
+                                                    __O);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O)
+{
+  return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X,
+                                                   (__v4sf)__Y,
+                                                   __O);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O)
+{
+  return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X,
+                                                    (__v4si)__Y,
+                                                    __O);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi32 (__m256i __X, int __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2);
+  __Y = _mm_insert_epi32 (__Y, __D, __N % 4);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 2);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi16 (__m256i __X, int __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3);
+  __Y = _mm_insert_epi16 (__Y, __D, __N % 8);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 3);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi8 (__m256i __X, int __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4);
+  __Y = _mm_insert_epi8 (__Y, __D, __N % 16);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 4);
+}
+
+#ifdef __x86_64__
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_insert_epi64 (__m256i __X, long long __D, int const __N)
+{
+  __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1);
+  __Y = _mm_insert_epi64 (__Y, __D, __N % 2);
+  return _mm256_insertf128_si256 (__X, __Y, __N >> 1);
+}
+#endif
+#else
+#define _mm256_insertf128_pd(X, Y, O)                                  \
+  ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X),   \
+                                              (__v2df)(__m128d)(Y),    \
+                                              (int)(O)))
+
+#define _mm256_insertf128_ps(X, Y, O)                                  \
+  ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X),     \
+                                             (__v4sf)(__m128)(Y),      \
+                                             (int)(O)))
+
+#define _mm256_insertf128_si256(X, Y, O)                               \
+  ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X),   \
+                                              (__v4si)(__m128i)(Y),    \
+                                              (int)(O)))
+
+#define _mm256_insert_epi32(X, D, N)                                   \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2);          \
+      __Y = _mm_insert_epi32 (__Y, (D), (N) % 4);                      \
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 2);                    \
+    }))
+
+#define _mm256_insert_epi16(X, D, N)                                   \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3);          \
+      __Y = _mm_insert_epi16 (__Y, (D), (N) % 8);                      \
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 3);                    \
+    }))
+
+#define _mm256_insert_epi8(X, D, N)                                    \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4);          \
+      __Y = _mm_insert_epi8 (__Y, (D), (N) % 16);                      \
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 4);                    \
+    }))
+
+#ifdef __x86_64__
+#define _mm256_insert_epi64(X, D, N)                                   \
+  (__extension__                                                       \
+   ({                                                                  \
+      __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1);          \
+      __Y = _mm_insert_epi64 (__Y, (D), (N) % 2);                      \
+      _mm256_insertf128_si256 ((X), __Y, (N) >> 1);                    \
+    }))
+#endif
+#endif
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_pd (double const *__P)
+{
+  return *(__m256d *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_pd (double *__P, __m256d __A)
+{
+  *(__m256d *)__P = __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_ps (float const *__P)
+{
+  return *(__m256 *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_ps (float *__P, __m256 __A)
+{
+  *(__m256 *)__P = __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_pd (double const *__P)
+{
+  return *(__m256d_u *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_pd (double *__P, __m256d __A)
+{
+  *(__m256d_u *)__P = __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_ps (float const *__P)
+{
+  return *(__m256_u *)__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_ps (float *__P, __m256 __A)
+{
+  *(__m256_u *)__P = __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_load_si256 (__m256i const *__P)
+{
+  return *__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_store_si256 (__m256i *__P, __m256i __A)
+{
+  *__P = __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu_si256 (__m256i_u const *__P)
+{
+  return *__P;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu_si256 (__m256i_u *__P, __m256i __A)
+{
+  *__P = __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_pd (double const *__P, __m128i __M)
+{
+  return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P,
+                                             (__v2di)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A)
+{
+  __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_pd (double const *__P, __m256i __M)
+{
+  return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P,
+                                                (__v4di)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A)
+{
+  __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskload_ps (float const *__P, __m128i __M)
+{
+  return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P,
+                                            (__v4si)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A)
+{
+  __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskload_ps (float const *__P, __m256i __M)
+{
+  return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P,
+                                               (__v8si)__M);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A)
+{
+  __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movehdup_ps (__m256 __X)
+{
+  return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_moveldup_ps (__m256 __X)
+{
+  return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movedup_pd (__m256d __X)
+{
+  return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_lddqu_si256 (__m256i const *__P)
+{
+  return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_si256 (__m256i *__A, __m256i __B)
+{
+  __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_pd (double *__A, __m256d __B)
+{
+  __builtin_ia32_movntpd256 (__A, (__v4df)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_stream_ps (float *__P, __m256 __A)
+{
+  __builtin_ia32_movntps256 (__P, (__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rcp_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_rsqrt_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_sqrt_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_round_pd (__m256d __V, const int __M)
+{
+  return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_round_ps (__m256 __V, const int __M)
+{
+  return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M);
+}
+#else
+#define _mm256_round_pd(V, M) \
+  ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M)))
+
+#define _mm256_round_ps(V, M) \
+  ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M)))
+#endif
+
+#define _mm256_ceil_pd(V)      _mm256_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm256_floor_pd(V)     _mm256_round_pd ((V), _MM_FROUND_FLOOR)
+#define _mm256_ceil_ps(V)      _mm256_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm256_floor_ps(V)     _mm256_round_ps ((V), _MM_FROUND_FLOOR)
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_pd (__m256d __A, __m256d __B)
+{
+  return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpackhi_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_unpacklo_ps (__m256 __A, __m256 __B)
+{
+  return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_pd (__m128d __M, __m128d __V)
+{
+  return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_pd (__m128d __M, __m128d __V)
+{
+  return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_pd (__m128d __M, __m128d __V)
+{
+  return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_ps (__m128 __M, __m128 __V)
+{
+  return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_ps (__m128 __M, __m128 __V)
+{
+  return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_ps (__m128 __M, __m128 __V)
+{
+  return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_pd (__m256d __M, __m256d __V)
+{
+  return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_pd (__m256d __M, __m256d __V)
+{
+  return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_pd (__m256d __M, __m256d __V)
+{
+  return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_ps (__m256 __M, __m256 __V)
+{
+  return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_ps (__m256 __M, __m256 __V)
+{
+  return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_ps (__m256 __M, __m256 __V)
+{
+  return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testz_si256 (__m256i __M, __m256i __V)
+{
+  return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testc_si256 (__m256i __M, __m256i __V)
+{
+  return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_testnzc_si256 (__m256i __M, __m256i __V)
+{
+  return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_pd (__m256d __A)
+{
+  return __builtin_ia32_movmskpd256 ((__v4df)__A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_movemask_ps (__m256 __A)
+{
+  return __builtin_ia32_movmskps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_pd (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m256d __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_ps (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m256 __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_undefined_si256 (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m256i __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_pd (void)
+{
+  return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_ps (void)
+{
+  return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0,
+                                0.0, 0.0, 0.0, 0.0 };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setzero_si256 (void)
+{
+  return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
+}
+
+/* Create the vector [A B C D].  */
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_pd (double __A, double __B, double __C, double __D)
+{
+  return __extension__ (__m256d){ __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H].  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_ps (float __A, float __B, float __C, float __D,
+              float __E, float __F, float __G, float __H)
+{
+  return __extension__ (__m256){ __H, __G, __F, __E,
+                                __D, __C, __B, __A };
+}
+
+/* Create the vector [A B C D E F G H].  */
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi32 (int __A, int __B, int __C, int __D,
+                 int __E, int __F, int __G, int __H)
+{
+  return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E,
+                                         __D, __C, __B, __A };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12,
+                 short __q11, short __q10, short __q09, short __q08,
+                 short __q07, short __q06, short __q05, short __q04,
+                 short __q03, short __q02, short __q01, short __q00)
+{
+  return __extension__ (__m256i)(__v16hi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+  };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi8  (char __q31, char __q30, char __q29, char __q28,
+                 char __q27, char __q26, char __q25, char __q24,
+                 char __q23, char __q22, char __q21, char __q20,
+                 char __q19, char __q18, char __q17, char __q16,
+                 char __q15, char __q14, char __q13, char __q12,
+                 char __q11, char __q10, char __q09, char __q08,
+                 char __q07, char __q06, char __q05, char __q04,
+                 char __q03, char __q02, char __q01, char __q00)
+{
+  return __extension__ (__m256i)(__v32qi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15,
+    __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23,
+    __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31
+  };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_epi64x (long long __A, long long __B, long long __C,
+                  long long __D)
+{
+  return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A };
+}
+
+/* Create a vector with all elements equal to A.  */
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_pd (double __A)
+{
+  return __extension__ (__m256d){ __A, __A, __A, __A };
+}
+
+/* Create a vector with all elements equal to A.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_ps (float __A)
+{
+  return __extension__ (__m256){ __A, __A, __A, __A,
+                                __A, __A, __A, __A };
+}
+
+/* Create a vector with all elements equal to A.  */
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi32 (int __A)
+{
+  return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A,
+                                         __A, __A, __A, __A };
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi16 (short __A)
+{
+  return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A,
+                          __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi8 (char __A)
+{
+  return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+                         __A, __A, __A, __A, __A, __A, __A, __A,
+                         __A, __A, __A, __A, __A, __A, __A, __A,
+                         __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set1_epi64x (long long __A)
+{
+  return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A };
+}
+
+/* Create vectors of elements in the reversed order from the
+   _mm256_set_XXX functions.  */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_pd (double __A, double __B, double __C, double __D)
+{
+  return _mm256_set_pd (__D, __C, __B, __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_ps (float __A, float __B, float __C, float __D,
+               float __E, float __F, float __G, float __H)
+{
+  return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi32 (int __A, int __B, int __C, int __D,
+                  int __E, int __F, int __G, int __H)
+{
+  return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12,
+                  short __q11, short __q10, short __q09, short __q08,
+                  short __q07, short __q06, short __q05, short __q04,
+                  short __q03, short __q02, short __q01, short __q00)
+{
+  return _mm256_set_epi16 (__q00, __q01, __q02, __q03,
+                          __q04, __q05, __q06, __q07,
+                          __q08, __q09, __q10, __q11,
+                          __q12, __q13, __q14, __q15);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi8  (char __q31, char __q30, char __q29, char __q28,
+                  char __q27, char __q26, char __q25, char __q24,
+                  char __q23, char __q22, char __q21, char __q20,
+                  char __q19, char __q18, char __q17, char __q16,
+                  char __q15, char __q14, char __q13, char __q12,
+                  char __q11, char __q10, char __q09, char __q08,
+                  char __q07, char __q06, char __q05, char __q04,
+                  char __q03, char __q02, char __q01, char __q00)
+{
+  return _mm256_set_epi8 (__q00, __q01, __q02, __q03,
+                         __q04, __q05, __q06, __q07,
+                         __q08, __q09, __q10, __q11,
+                         __q12, __q13, __q14, __q15,
+                         __q16, __q17, __q18, __q19,
+                         __q20, __q21, __q22, __q23,
+                         __q24, __q25, __q26, __q27,
+                         __q28, __q29, __q30, __q31);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_epi64x (long long __A, long long __B, long long __C,
+                   long long __D)
+{
+  return _mm256_set_epi64x (__D, __C, __B, __A);
+}
+
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values, they just change the type.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_ps (__m256d __A)
+{
+  return (__m256) __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd_si256 (__m256d __A)
+{
+  return (__m256i) __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_pd (__m256 __A)
+{
+  return (__m256d) __A;
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps_si256(__m256 __A)
+{
+  return (__m256i) __A;
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_ps (__m256i __A)
+{
+  return (__m256) __A;
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_pd (__m256i __A)
+{
+  return (__m256d) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd256_pd128 (__m256d __A)
+{
+  return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps256_ps128 (__m256 __A)
+{
+  return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi256_si128 (__m256i __A)
+{
+  return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A);
+}
+
+/* When cast is done from a 128 to 256-bit type, the low 128 bits of
+   the 256-bit result contain source parameter value and the upper 128
+   bits of the result are undefined.  Those intrinsics shouldn't
+   generate any extra moves.  */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castpd128_pd256 (__m128d __A)
+{
+  return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castps128_ps256 (__m128 __A)
+{
+  return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_castsi128_si256 (__m128i __A)
+{
+  return (__m256i) __builtin_ia32_si256_si ((__v4si)__A);
+}
+
+/* Similarly, but with zero extension instead of undefined values.  */
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextpd128_pd256 (__m128d __A)
+{
+  return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextps128_ps256 (__m128 __A)
+{
+  return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_zextsi128_si256 (__m128i __A)
+{
+  return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_m128 ( __m128 __H, __m128 __L)
+{
+  return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_m128d (__m128d __H, __m128d __L)
+{
+  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_set_m128i (__m128i __H, __m128i __L)
+{
+  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_m128 (__m128 __L, __m128 __H)
+{
+  return _mm256_set_m128 (__H, __L);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_m128d (__m128d __L, __m128d __H)
+{
+  return _mm256_set_m128d (__H, __L);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_setr_m128i (__m128i __L, __m128i __H)
+{
+  return _mm256_set_m128i (__H, __L);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128 (float const *__PH, float const *__PL)
+{
+  return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)),
+                              _mm_loadu_ps (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A)
+{
+  _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A));
+  _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1));
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128d (double const *__PH, double const *__PL)
+{
+  return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)),
+                              _mm_loadu_pd (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A)
+{
+  _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A));
+  _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1));
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL)
+{
+  return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)),
+                                 _mm_loadu_si128 (__PH), 1);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A)
+{
+  _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A));
+  _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1));
+}
+
+#ifdef __DISABLE_AVX__
+#undef __DISABLE_AVX__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVX__ */
+
+#endif /* _AVXINTRIN_H_INCLUDED */
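A minimal usage sketch for the AVX set/cast/store helpers above (illustrative only, not part of the committed header; assumes GCC invoked with -mavx on an AVX-capable CPU):

#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  /* Element 0 is the last argument, so memory order is {1.0, 2.0, 3.0, 4.0}.  */
  __m256d v = _mm256_set_pd (4.0, 3.0, 2.0, 1.0);
  /* Reinterpret the low 128 bits; no data movement is required.  */
  __m128d lo = _mm256_castpd256_pd128 (v);
  double out[2];
  _mm_storeu_pd (out, lo);
  printf ("%f %f\n", out[0], out[1]);  /* 1.000000 2.000000 */
  return 0;
}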
diff --git a/include-gcc/avxneconvertintrin.h b/include-gcc/avxneconvertintrin.h
new file mode 100644 (file)
index 0000000..7a90ae1
--- /dev/null
@@ -0,0 +1,140 @@
+/* Copyright (C) 2021-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXNECONVERTINTRIN_H_INCLUDED
+#define _AVXNECONVERTINTRIN_H_INCLUDED
+
+#ifndef __AVXNECONVERT__
+#pragma GCC push_options
+#pragma GCC target ("avxneconvert")
+#define __DISABLE_AVXNECONVERT__
+#endif /* __AVXNECONVERT__ */
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bcstnebf16_ps (const void *__P)
+{
+  return (__m128) __builtin_ia32_vbcstnebf162ps128 ((const __bf16 *) __P);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bcstnebf16_ps (const void *__P)
+{
+  return (__m256) __builtin_ia32_vbcstnebf162ps256 ((const __bf16 *) __P);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bcstnesh_ps (const void *__P)
+{
+  return (__m128) __builtin_ia32_vbcstnesh2ps128 ((const _Float16 *) __P);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_bcstnesh_ps (const void *__P)
+{
+  return (__m256) __builtin_ia32_vbcstnesh2ps256 ((const _Float16 *) __P);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneebf16_ps (const __m128bh *__A)
+{
+  return (__m128) __builtin_ia32_vcvtneebf162ps128 ((const __v8bf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneebf16_ps (const __m256bh *__A)
+{
+  return (__m256) __builtin_ia32_vcvtneebf162ps256 ((const __v16bf *) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneeph_ps (const __m128h *__A)
+{
+  return (__m128) __builtin_ia32_vcvtneeph2ps128 ((const __v8hf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneeph_ps (const __m256h *__A)
+{
+  return (__m256) __builtin_ia32_vcvtneeph2ps256 ((const __v16hf *) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneobf16_ps (const __m128bh *__A)
+{
+  return (__m128) __builtin_ia32_vcvtneobf162ps128 ((const __v8bf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneobf16_ps (const __m256bh *__A)
+{
+  return (__m256) __builtin_ia32_vcvtneobf162ps256 ((const __v16bf *) __A);
+}
+
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneoph_ps (const __m128h *__A)
+{
+  return (__m128) __builtin_ia32_vcvtneoph2ps128 ((const __v8hf *) __A);
+}
+
+extern __inline __m256
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneoph_ps (const __m256h *__A)
+{
+  return (__m256) __builtin_ia32_vcvtneoph2ps256 ((const __v16hf *) __A);
+}
+
+extern __inline __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtneps_avx_pbh (__m128 __A)
+{
+  return (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (__A);
+}
+
+extern __inline __m128bh
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtneps_avx_pbh (__m256 __A)
+{
+  return (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (__A);
+}
+
+#ifdef __DISABLE_AVXNECONVERT__
+#undef __DISABLE_AVXNECONVERT__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXNECONVERT__ */
+
+#endif /* _AVXNECONVERTINTRIN_H_INCLUDED */
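An illustrative sketch of the broadcast-and-convert intrinsics above (assumes -mavxneconvert and matching hardware; the wrapper name exists only for the example):

#include <immintrin.h>

/* Broadcast a single bf16 value from memory into all eight float lanes.  */
static __m256
broadcast_bf16 (const void *p)
{
  return _mm256_bcstnebf16_ps (p);
}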
diff --git a/include-gcc/avxvnniint8intrin.h b/include-gcc/avxvnniint8intrin.h
new file mode 100644 (file)
index 0000000..9f8f174
--- /dev/null
@@ -0,0 +1,138 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+#error "Never use <avxvnniint8vlintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXVNNIINT8INTRIN_H_INCLUDED
+#define _AVXVNNIINT8INTRIN_H_INCLUDED
+
+#if !defined(__AVXVNNIINT8__)
+#pragma GCC push_options
+#pragma GCC target("avxvnniint8")
+#define __DISABLE_AVXVNNIINT8__
+#endif /* __AVXVNNIINT8__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbssd_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+  return (__m128i)
+    __builtin_ia32_vpdpbssd128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbssds_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+  return (__m128i)
+    __builtin_ia32_vpdpbssds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbsud_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+  return (__m128i)
+    __builtin_ia32_vpdpbsud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbsuds_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+  return (__m128i)
+    __builtin_ia32_vpdpbsuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbuud_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+  return (__m128i)
+    __builtin_ia32_vpdpbuud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbuuds_epi32 (__m128i __W, __m128i __A, __m128i __B)
+{
+  return (__m128i)
+    __builtin_ia32_vpdpbuuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbssd_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+  return (__m256i)
+    __builtin_ia32_vpdpbssd256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbssds_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+  return (__m256i)
+    __builtin_ia32_vpdpbssds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbsud_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+  return (__m256i)
+    __builtin_ia32_vpdpbsud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbsuds_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+  return (__m256i)
+    __builtin_ia32_vpdpbsuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbuud_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+  return (__m256i)
+    __builtin_ia32_vpdpbuud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbuuds_epi32 (__m256i __W, __m256i __A, __m256i __B)
+{
+  return (__m256i)
+    __builtin_ia32_vpdpbuuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B);
+}
+
+#ifdef __DISABLE_AVXVNNIINT8__
+#undef __DISABLE_AVXVNNIINT8__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXVNNIINT8__ */
+
+#endif /* __AVXVNNIINT8INTRIN_H_INCLUDED */
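For orientation, the signed-signed variant above accumulates four byte products per 32-bit lane into the first operand; a hedged sketch (assumes -mavxvnniint8; the wrapper name is illustrative):

#include <immintrin.h>

/* acc[i] += a[4i]*b[4i] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3],
   with all bytes treated as signed.  */
static __m256i
dot_accumulate_s8 (__m256i acc, __m256i a, __m256i b)
{
  return _mm256_dpbssd_epi32 (acc, a, b);
}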
diff --git a/include-gcc/avxvnniintrin.h b/include-gcc/avxvnniintrin.h
new file mode 100644 (file)
index 0000000..cdea8a9
--- /dev/null
@@ -0,0 +1,113 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXVNNIINTRIN_H_INCLUDED
+#define _AVXVNNIINTRIN_H_INCLUDED
+
+#if !defined(__AVXVNNI__)
+#pragma GCC push_options
+#pragma GCC target("avxvnni")
+#define __DISABLE_AVXVNNIVL__
+#endif /* __AVXVNNI__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbusd_avx_epi32(__m256i __A, __m256i __B, __m256i __C)
+{
+  return   (__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) __A,
+                                                  (__v8si) __B,
+                                                  (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbusd_avx_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbusds_avx_epi32(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbusds_avx_epi32(__m128i __A,__m128i __B,__m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpwssd_avx_epi32(__m256i __A,__m256i __B,__m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) __A,
+                                                (__v8si) __B,
+                                                (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpwssd_avx_epi32(__m128i __A,__m128i __B,__m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) __A,
+                                                (__v4si) __B,
+                                                (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpwssds_avx_epi32(__m256i __A,__m256i __B,__m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpwssds_avx_epi32(__m128i __A,__m128i __B,__m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __C);
+}
+
+#ifdef __DISABLE_AVXVNNIVL__
+#undef __DISABLE_AVXVNNIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXVNNIVL__ */
+#endif /* _AVXVNNIINTRIN_H_INCLUDED */
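The VEX-encoded VNNI forms above take the accumulator as the first argument and multiply unsigned bytes of the second operand with signed bytes of the third; a minimal sketch (assumes -mavxvnni; the wrapper name is illustrative):

#include <immintrin.h>

static __m256i
dot_accumulate_u8s8 (__m256i acc, __m256i a_u8, __m256i b_s8)
{
  return _mm256_dpbusd_avx_epi32 (acc, a_u8, b_s8);
}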
diff --git a/include-gcc/bmi2intrin.h b/include-gcc/bmi2intrin.h
new file mode 100644 (file)
index 0000000..c9915a5
--- /dev/null
@@ -0,0 +1,109 @@
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+#ifndef __BMI2__
+#pragma GCC push_options
+#pragma GCC target("bmi2")
+#define __DISABLE_BMI2__
+#endif /* __BMI2__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bzhi_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pdep_si (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_pext_si (__X, __Y);
+}
+
+#ifdef  __x86_64__
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bzhi_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pdep_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_pext_di (__X, __Y);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+          unsigned long long *__P)
+{
+  unsigned __int128 __res = (unsigned __int128) __X * __Y;
+  *__P = (unsigned long long) (__res >> 64);
+  return (unsigned long long) __res;
+}
+
+#else /* !__x86_64__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+  unsigned long long __res = (unsigned long long) __X * __Y;
+  *__P = (unsigned int) (__res >> 32);
+  return (unsigned int) __res;
+}
+
+#endif /* !__x86_64__  */
+
+#ifdef __DISABLE_BMI2__
+#undef __DISABLE_BMI2__
+#pragma GCC pop_options
+#endif /* __DISABLE_BMI2__ */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
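A short sketch of the BMI2 helpers above: _pext_u32 gathers the bits selected by a mask into the low bits of the result, and _mulx_u64 returns the low half of a full 64x64 multiply while storing the high half through the pointer (illustrative only; assumes an x86-64 target built with -mbmi2):

#include <x86gprintrin.h>
#include <stdio.h>

int main (void)
{
  /* Collect the even-indexed bits of the source into the low 16 bits.  */
  unsigned int packed = _pext_u32 (0xA5A5A5A5u, 0x55555555u);

  unsigned long long hi;
  unsigned long long lo = _mulx_u64 (0xFFFFFFFFFFFFFFFFull, 2ull, &hi);

  printf ("%#x %#llx %#llx\n", packed, hi, lo);
  return 0;
}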
diff --git a/include-gcc/bmiintrin.h b/include-gcc/bmiintrin.h
new file mode 100644 (file)
index 0000000..ec8945d
--- /dev/null
@@ -0,0 +1,202 @@
+/* Copyright (C) 2010-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _BMIINTRIN_H_INCLUDED
+#define _BMIINTRIN_H_INCLUDED
+
+#ifndef __BMI__
+#pragma GCC push_options
+#pragma GCC target("bmi")
+#define __DISABLE_BMI__
+#endif /* __BMI__ */
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u16 (unsigned short __X)
+{
+  return __builtin_ia32_tzcnt_u16 (__X);
+}
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u16 (unsigned short __X)
+{
+  return __builtin_ia32_tzcnt_u16 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u32 (unsigned int __X, unsigned int __Y)
+{
+  return ~__X & __Y;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_andn_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __andn_u32 (__X, __Y);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u32 (unsigned int __X, unsigned int __Y)
+{
+  return __builtin_ia32_bextr_u32 (__X, __Y);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u32 (unsigned int __X, unsigned int __Y, unsigned __Z)
+{
+  return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u32 (unsigned int __X)
+{
+  return __X & -__X;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u32 (unsigned int __X)
+{
+  return __blsi_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u32 (unsigned int __X)
+{
+  return __X ^ (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u32 (unsigned int __X)
+{
+  return __blsmsk_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u32 (unsigned int __X)
+{
+  return __X & (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u32 (unsigned int __X)
+{
+  return __blsr_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u32 (unsigned int __X)
+{
+  return __builtin_ia32_tzcnt_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u32 (unsigned int __X)
+{
+  return __builtin_ia32_tzcnt_u32 (__X);
+}
+
+
+#ifdef  __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return ~__X & __Y;
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __andn_u64 (__X, __Y);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u64 (unsigned long long __X, unsigned long long __Y)
+{
+  return __builtin_ia32_bextr_u64 (__X, __Y);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u64 (unsigned long long __X, unsigned int __Y, unsigned int __Z)
+{
+  return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8)));
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u64 (unsigned long long __X)
+{
+  return __X & -__X;
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u64 (unsigned long long __X)
+{
+  return __blsi_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u64 (unsigned long long __X)
+{
+  return __X ^ (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u64 (unsigned long long __X)
+{
+  return __blsmsk_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u64 (unsigned long long __X)
+{
+  return __X & (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u64 (unsigned long long __X)
+{
+  return __blsr_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u64 (unsigned long long __X)
+{
+  return __builtin_ia32_tzcnt_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u64 (unsigned long long __X)
+{
+  return __builtin_ia32_tzcnt_u64 (__X);
+}
+
+#endif /* __x86_64__  */
+
+#ifdef __DISABLE_BMI__
+#undef __DISABLE_BMI__
+#pragma GCC pop_options
+#endif /* __DISABLE_BMI__ */
+
+#endif /* _BMIINTRIN_H_INCLUDED */
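Most of the BMI1 helpers above are plain bit tricks (x & -x, x ^ (x - 1), x & (x - 1)) that GCC pattern-matches to BLSI/BLSMSK/BLSR; a minimal sketch (assumes -mbmi; the wrapper name is mine):

#include <x86gprintrin.h>

/* Report the number of trailing zeros (32 when x == 0) and return x with its
   lowest set bit cleared.  */
static unsigned int
clear_lowest_bit (unsigned int x, unsigned int *trailing_zeros)
{
  *trailing_zeros = _tzcnt_u32 (x);
  return _blsr_u32 (x);
}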
diff --git a/include-gcc/cetintrin.h b/include-gcc/cetintrin.h
new file mode 100644 (file)
index 0000000..db21a4c
--- /dev/null
@@ -0,0 +1,129 @@
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <cetintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CETINTRIN_H_INCLUDED
+#define _CETINTRIN_H_INCLUDED
+
+#ifndef __SHSTK__
+#pragma GCC push_options
+#pragma GCC target ("shstk")
+#define __DISABLE_SHSTK__
+#endif /* __SHSTK__ */
+
+#ifdef __x86_64__
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_get_ssp (void)
+{
+  return __builtin_ia32_rdsspq ();
+}
+#else
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_get_ssp (void)
+{
+  return __builtin_ia32_rdsspd ();
+}
+#endif
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_inc_ssp (unsigned int __B)
+{
+#ifdef __x86_64__
+  __builtin_ia32_incsspq ((unsigned long long) __B);
+#else
+  __builtin_ia32_incsspd (__B);
+#endif
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_saveprevssp (void)
+{
+  __builtin_ia32_saveprevssp ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rstorssp (void *__B)
+{
+  __builtin_ia32_rstorssp (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrssd (unsigned int __B, void *__C)
+{
+  __builtin_ia32_wrssd (__B, __C);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrssq (unsigned long long __B, void *__C)
+{
+  __builtin_ia32_wrssq (__B, __C);
+}
+#endif
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrussd (unsigned int __B, void *__C)
+{
+  __builtin_ia32_wrussd (__B, __C);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrussq (unsigned long long __B, void *__C)
+{
+  __builtin_ia32_wrussq (__B, __C);
+}
+#endif
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_setssbsy (void)
+{
+  __builtin_ia32_setssbsy ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_clrssbsy (void *__B)
+{
+  __builtin_ia32_clrssbsy (__B);
+}
+
+#ifdef __DISABLE_SHSTK__
+#undef __DISABLE_SHSTK__
+#pragma GCC pop_options
+#endif /* __DISABLE_SHSTK__ */
+
+#endif /* _CETINTRIN_H_INCLUDED.  */
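A hedged sketch for the shadow-stack intrinsics above: on hardware or kernels without CET enabled the read is effectively a no-op, so a zero result can be treated as "not active" (assumes an x86-64 target built with -mshstk; the wrapper name is illustrative):

#include <x86gprintrin.h>

/* Read the current shadow-stack pointer; expect 0 when shadow stacks are not
   enabled for this process.  */
static unsigned long long
current_ssp (void)
{
  return _get_ssp ();
}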
diff --git a/include-gcc/cldemoteintrin.h b/include-gcc/cldemoteintrin.h
new file mode 100644 (file)
index 0000000..0641f67
--- /dev/null
@@ -0,0 +1,47 @@
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <cldemoteintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CLDEMOTE_H_INCLUDED
+#define _CLDEMOTE_H_INCLUDED
+
+#ifndef __CLDEMOTE__
+#pragma GCC push_options
+#pragma GCC target("cldemote")
+#define __DISABLE_CLDEMOTE__
+#endif /* __CLDEMOTE__ */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cldemote (void *__A)
+{
+  __builtin_ia32_cldemote (__A);
+}
+#ifdef __DISABLE_CLDEMOTE__
+#undef __DISABLE_CLDEMOTE__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLDEMOTE__ */
+
+#endif /* _CLDEMOTE_H_INCLUDED */
diff --git a/include-gcc/clflushoptintrin.h b/include-gcc/clflushoptintrin.h
new file mode 100644 (file)
index 0000000..8fc45df
--- /dev/null
@@ -0,0 +1,49 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <clflushoptintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CLFLUSHOPTINTRIN_H_INCLUDED
+#define _CLFLUSHOPTINTRIN_H_INCLUDED
+
+#ifndef __CLFLUSHOPT__
+#pragma GCC push_options
+#pragma GCC target("clflushopt")
+#define __DISABLE_CLFLUSHOPT__
+#endif /* __CLFLUSHOPT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clflushopt (void *__A)
+{
+  __builtin_ia32_clflushopt (__A);
+}
+
+#ifdef __DISABLE_CLFLUSHOPT__
+#undef __DISABLE_CLFLUSHOPT__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLFLUSHOPT__ */
+
+#endif /* _CLFLUSHOPTINTRIN_H_INCLUDED */
diff --git a/include-gcc/clwbintrin.h b/include-gcc/clwbintrin.h
new file mode 100644 (file)
index 0000000..ef89b03
--- /dev/null
@@ -0,0 +1,49 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <clwbintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CLWBINTRIN_H_INCLUDED
+#define _CLWBINTRIN_H_INCLUDED
+
+#ifndef __CLWB__
+#pragma GCC push_options
+#pragma GCC target("clwb")
+#define __DISABLE_CLWB__
+#endif /* __CLWB__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clwb (void *__A)
+{
+  __builtin_ia32_clwb (__A);
+}
+
+#ifdef __DISABLE_CLWB__
+#undef __DISABLE_CLWB__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLWB__ */
+
+#endif /* _CLWBINTRIN_H_INCLUDED */
diff --git a/include-gcc/clzerointrin.h b/include-gcc/clzerointrin.h
new file mode 100644 (file)
index 0000000..552ec5d
--- /dev/null
@@ -0,0 +1,44 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _CLZEROINTRIN_H_INCLUDED
+#define _CLZEROINTRIN_H_INCLUDED
+
+#ifndef __CLZERO__
+#pragma GCC push_options
+#pragma GCC target("clzero")
+#define __DISABLE_CLZERO__
+#endif /* __CLZERO__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clzero (void * __I)
+{
+  __builtin_ia32_clzero (__I);
+}
+
+#ifdef __DISABLE_CLZERO__
+#undef __DISABLE_CLZERO__
+#pragma GCC pop_options
+#endif /* __DISABLE_CLZERO__ */
+
+#endif /* _CLZEROINTRIN_H_INCLUDED */
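The last few headers (CLDEMOTE, CLFLUSHOPT, CLWB, CLZERO) each wrap a single cache-management instruction. A minimal, illustrative flush loop using _mm_clflushopt (assumes -mclflushopt, 64-byte cache lines, and <immintrin.h> as the umbrella include; the function name is mine):

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

/* Flush every cache line covering buf[0..len) back toward memory.  */
static void
flush_buffer (const void *buf, size_t len)
{
  uintptr_t p = (uintptr_t) buf & ~(uintptr_t) 63;
  uintptr_t end = (uintptr_t) buf + len;
  for (; p < end; p += 64)
    _mm_clflushopt ((void *) p);
  _mm_sfence ();  /* order the flushes against later stores */
}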
diff --git a/include-gcc/cmpccxaddintrin.h b/include-gcc/cmpccxaddintrin.h
new file mode 100644 (file)
index 0000000..c458a9d
--- /dev/null
@@ -0,0 +1,89 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+#error "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _CMPCCXADDINTRIN_H_INCLUDED
+#define _CMPCCXADDINTRIN_H_INCLUDED
+
+#ifdef __x86_64__
+
+#ifndef __CMPCCXADD__
+#pragma GCC push_options
+#pragma GCC target("cmpccxadd")
+#define __DISABLE_CMPCCXADD__
+#endif /* __CMPCCXADD__ */
+
+typedef enum {
+    _CMPCCX_O,   /* Overflow.  */
+    _CMPCCX_NO,  /* No overflow.  */
+    _CMPCCX_B,   /* Below.  */
+    _CMPCCX_NB,  /* Not below.  */
+    _CMPCCX_Z,   /* Zero.  */
+    _CMPCCX_NZ,  /* Not zero.  */
+    _CMPCCX_BE,  /* Below or equal.  */
+    _CMPCCX_NBE, /* Neither below nor equal.  */
+    _CMPCCX_S,   /* Sign.  */
+    _CMPCCX_NS,  /* No sign.  */
+    _CMPCCX_P,   /* Parity.  */
+    _CMPCCX_NP,  /* No parity.  */
+    _CMPCCX_L,   /* Less.  */
+    _CMPCCX_NL,  /* Not less.  */
+    _CMPCCX_LE,  /* Less or equal.  */
+    _CMPCCX_NLE, /* Neither less nor equal.  */
+} _CMPCCX_ENUM;
+
+#ifdef __OPTIMIZE__
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cmpccxadd_epi32 (int *__A, int __B, int __C, const _CMPCCX_ENUM __D)
+{
+  return __builtin_ia32_cmpccxadd (__A, __B, __C, __D);
+}
+
+extern __inline long long
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_cmpccxadd_epi64 (long long *__A, long long __B, long long __C,
+                  const _CMPCCX_ENUM __D)
+{
+  return __builtin_ia32_cmpccxadd64 (__A, __B, __C, __D);
+}
+#else
+#define _cmpccxadd_epi32(A,B,C,D) \
+  __builtin_ia32_cmpccxadd ((int *) (A), (int) (B), (int) (C), \
+                           (_CMPCCX_ENUM) (D))
+#define _cmpccxadd_epi64(A,B,C,D) \
+  __builtin_ia32_cmpccxadd64 ((long long *) (A), (long long) (B), \
+                             (long long) (C), (_CMPCCX_ENUM) (D))
+#endif
+
+#ifdef __DISABLE_CMPCCXADD__
+#undef __DISABLE_CMPCCXADD__
+#pragma GCC pop_options
+#endif /* __DISABLE_CMPCCXADD__ */
+
+#endif
+
+#endif /* _CMPCCXADDINTRIN_H_INCLUDED */
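CMPccXADD compares the value at the target address with the second argument and, when the chosen condition holds, atomically adds the third argument; either way the original memory value is returned. A hedged sketch (assumes an x86-64 target with -mcmpccxadd; the helper name is illustrative):

#include <x86gprintrin.h>

/* Increment *counter only while it is still below limit; returns the value
   observed before the attempt.  */
static int
bounded_increment (int *counter, int limit)
{
  return _cmpccxadd_epi32 (counter, limit, 1, _CMPCCX_L);
}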
diff --git a/include-gcc/emmintrin.h b/include-gcc/emmintrin.h
new file mode 100644 (file)
index 0000000..3599be7
--- /dev/null
@@ -0,0 +1,1608 @@
+/* Copyright (C) 2003-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _EMMINTRIN_H_INCLUDED
+#define _EMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE header files.  */
+#include <xmmintrin.h>
+
+#ifndef __SSE2__
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+/* SSE2 */
+typedef double __v2df __attribute__ ((__vector_size__ (16)));
+typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16)));
+typedef int __v4si __attribute__ ((__vector_size__ (16)));
+typedef unsigned int __v4su __attribute__ ((__vector_size__ (16)));
+typedef short __v8hi __attribute__ ((__vector_size__ (16)));
+typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16)));
+typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef signed char __v16qs __attribute__ ((__vector_size__ (16)));
+typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16)));
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same types.  */
+typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
+/* Create a selector for use with the SHUFPD instruction.  */
+#define _MM_SHUFFLE2(fp1,fp0) \
+ (((fp1) << 1) | (fp0))
+
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_sd (double __F)
+{
+  return __extension__ (__m128d){ __F, 0.0 };
+}
+
+/* Create a vector with both elements equal to F.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pd (double __F)
+{
+  return __extension__ (__m128d){ __F, __F };
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd1 (double __F)
+{
+  return _mm_set1_pd (__F);
+}
+
+/* Create a vector with the lower value X and upper value W.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pd (double __W, double __X)
+{
+  return __extension__ (__m128d){ __X, __W };
+}
+
+/* Create a vector with the lower value W and upper value X.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pd (double __W, double __X)
+{
+  return __extension__ (__m128d){ __W, __X };
+}
+
+/* Create an undefined vector.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_pd (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m128d __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_pd (void)
+{
+  return __extension__ (__m128d){ 0.0, 0.0 };
+}
+
+/* Sets the low DPFP value of A from the low value of B.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_sd (__m128d __A, __m128d __B)
+{
+  return __extension__ (__m128d) __builtin_shuffle ((__v2df)__A, (__v2df)__B, (__v2di){2, 1});
+}
+
+/* Load two DPFP values from P.  The address must be 16-byte aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd (double const *__P)
+{
+  return *(__m128d *)__P;
+}
+
+/* Load two DPFP values from P.  The address need not be 16-byte aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_pd (double const *__P)
+{
+  return *(__m128d_u *)__P;
+}
+
+/* Create a vector with all two elements equal to *P.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_pd (double const *__P)
+{
+  return _mm_set1_pd (*__P);
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_sd (double const *__P)
+{
+  return _mm_set_sd (*__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_pd1 (double const *__P)
+{
+  return _mm_load1_pd (__P);
+}
+
+/* Load two DPFP values in reverse order.  The address must be aligned.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_pd (double const *__P)
+{
+  __m128d __tmp = _mm_load_pd (__P);
+  return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
+}
+
+/* Store two DPFP values.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd (double *__P, __m128d __A)
+{
+  *(__m128d *)__P = __A;
+}
+
+/* Store two DPFP values.  The address need not be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_pd (double *__P, __m128d __A)
+{
+  *(__m128d_u *)__P = __A;
+}
+
+/* Stores the lower DPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_sd (double *__P, __m128d __A)
+{
+  *__P = ((__v2df)__A)[0];
+}
+
+extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_f64 (__m128d __A)
+{
+  return ((__v2df)__A)[0];
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pd (double *__P, __m128d __A)
+{
+  _mm_store_sd (__P, __A);
+}
+
+/* Stores the upper DPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pd (double *__P, __m128d __A)
+{
+  *__P = ((__v2df)__A)[1];
+}
+
+/* Store the lower DPFP value across two words.
+   The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_pd (double *__P, __m128d __A)
+{
+  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_pd1 (double *__P, __m128d __A)
+{
+  _mm_store1_pd (__P, __A);
+}
+
+/* Store two DPFP values in reverse order.  The address must be aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_pd (double *__P, __m128d __A)
+{
+  _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si32 (__m128i __A)
+{
+  return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64 (__m128i __A)
+{
+  return ((__v2di)__A)[0];
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi128_si64x (__m128i __A)
+{
+  return ((__v2di)__A)[0];
+}
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A + (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A - (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A * (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d) ((__v2df)__A / (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B);
+}
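
Quick reference (illustrative): the _pd arithmetic forms operate on both lanes, while the _sd forms combine only the low lanes and copy the upper lane of the first operand through unchanged. A minimal sketch:

    #include <emmintrin.h>

    void add_forms_demo (void)
    {
      __m128d a = _mm_set_pd (10.0, 1.0);   /* lanes: {1.0, 10.0} (low, high) */
      __m128d b = _mm_set_pd (20.0, 2.0);   /* lanes: {2.0, 20.0} */
      __m128d p = _mm_add_pd (a, b);        /* {3.0, 30.0} */
      __m128d s = _mm_add_sd (a, b);        /* {3.0, 10.0}: high lane taken from a */
      (void) p; (void) s;
    }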
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_pd (__m128d __A)
+{
+  return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A);
+}
+
+/* Return pair {sqrt (B[0]), A[1]}.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_sd (__m128d __A, __m128d __B)
+{
+  __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
+  return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B);
+}
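
The packed comparisons return a per-lane mask of all-ones (true) or all-zeros (false), which is normally consumed by the bitwise helpers above. A branch-free select sketch (illustrative; select_lt is a hypothetical helper):

    #include <emmintrin.h>

    /* per lane: result = (a < b) ? x : y */
    static __m128d select_lt (__m128d a, __m128d b, __m128d x, __m128d y)
    {
      __m128d m = _mm_cmplt_pd (a, b);   /* all-ones where a < b */
      return _mm_or_pd (_mm_and_pd (m, x), _mm_andnot_pd (m, y));
    }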
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+                                        (__v2df)
+                                        __builtin_ia32_cmpltsd ((__v2df) __B,
+                                                                (__v2df)
+                                                                __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+                                        (__v2df)
+                                        __builtin_ia32_cmplesd ((__v2df) __B,
+                                                                (__v2df)
+                                                                __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+                                        (__v2df)
+                                        __builtin_ia32_cmpnltsd ((__v2df) __B,
+                                                                 (__v2df)
+                                                                 __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df) __A,
+                                        (__v2df)
+                                        __builtin_ia32_cmpnlesd ((__v2df) __B,
+                                                                 (__v2df)
+                                                                 __A));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_sd (__m128d __A, __m128d __B)
+{
+  return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
+}
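
Unlike the _mm_cmp*_sd mask forms, the comi/ucomi intrinsics compare only the low lanes and return a plain int (0 or 1); the ucomi variants are the quiet, non-signalling versions for NaN operands. For example (illustrative):

    #include <emmintrin.h>

    int low_lane_less (__m128d a, __m128d b)
    {
      return _mm_comilt_sd (a, b);   /* 1 if a[0] < b[0], else 0 */
    }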
+
+/* Create a vector of Qi, where i is the element number.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64x (long long __q1, long long __q0)
+{
+  return __extension__ (__m128i)(__v2di){ __q0, __q1 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi64 (__m64 __q1,  __m64 __q0)
+{
+  return _mm_set_epi64x ((long long)__q1, (long long)__q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
+{
+  return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
+              short __q3, short __q2, short __q1, short __q0)
+{
+  return __extension__ (__m128i)(__v8hi){
+    __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
+             char __q11, char __q10, char __q09, char __q08,
+             char __q07, char __q06, char __q05, char __q04,
+             char __q03, char __q02, char __q01, char __q00)
+{
+  return __extension__ (__m128i)(__v16qi){
+    __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+    __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+  };
+}
+
+/* Set all of the elements of the vector to A.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64x (long long __A)
+{
+  return _mm_set_epi64x (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi64 (__m64 __A)
+{
+  return _mm_set_epi64 (__A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi32 (int __A)
+{
+  return _mm_set_epi32 (__A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi16 (short __A)
+{
+  return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_epi8 (char __A)
+{
+  return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+                      __A, __A, __A, __A, __A, __A, __A, __A);
+}
+
+/* Create a vector of Qi, where i is the element number.
+   The parameter order is reversed from the _mm_set_epi* functions.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi64 (__m64 __q0, __m64 __q1)
+{
+  return _mm_set_epi64 (__q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
+{
+  return _mm_set_epi32 (__q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
+               short __q4, short __q5, short __q6, short __q7)
+{
+  return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
+              char __q04, char __q05, char __q06, char __q07,
+              char __q08, char __q09, char __q10, char __q11,
+              char __q12, char __q13, char __q14, char __q15)
+{
+  return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
+                      __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
+}
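
Note on argument order (illustrative): the _mm_set_epi* constructors take the highest-numbered element first, while the _mm_setr_epi* variants take element 0 first; the two calls below build identical vectors.

    #include <emmintrin.h>

    void set_order_demo (void)
    {
      __m128i a = _mm_set_epi32 (3, 2, 1, 0);    /* element 0 = 0 ... element 3 = 3 */
      __m128i b = _mm_setr_epi32 (0, 1, 2, 3);   /* same contents, memory order */
      (void) a; (void) b;
    }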
+
+/* Load 128 bits of integer data; the narrower loads below set
+   element 0 from *P and zero the remaining elements.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_si128 (__m128i const *__P)
+{
+  return *__P;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si128 (__m128i_u const *__P)
+{
+  return *__P;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_epi64 (__m128i_u const *__P)
+{
+  return _mm_set_epi64 ((__m64)0LL, *(__m64_u *)__P);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si64 (void const *__P)
+{
+  return _mm_loadl_epi64 ((__m128i_u *)__P);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si32 (void const *__P)
+{
+  return _mm_set_epi32 (0, 0, 0, (*(__m32_u *)__P)[0]);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_si16 (void const *__P)
+{
+  return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, (*(__m16_u *)__P)[0]);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_si128 (__m128i *__P, __m128i __B)
+{
+  *__P = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
+{
+  *__P = __B;
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
+{
+  *(__m64_u *)__P = (__m64) ((__v2di)__B)[0];
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si64 (void *__P, __m128i __B)
+{
+  _mm_storel_epi64 ((__m128i_u *)__P, __B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si32 (void *__P, __m128i __B)
+{
+  *(__m32_u *)__P = (__m32) ((__v4si)__B)[0];
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_si16 (void *__P, __m128i __B)
+{
+  *(__m16_u *)__P = (__m16) ((__v8hi)__B)[0];
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movepi64_pi64 (__m128i __B)
+{
+  return (__m64) ((__v2di)__B)[0];
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movpi64_epi64 (__m64 __A)
+{
+  return _mm_set_epi64 ((__m64)0LL, __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_epi64 (__m128i __A)
+{
+  return (__m128i)__builtin_ia32_movq128 ((__v2di) __A);
+}
+
+/* Create an undefined vector.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_si128 (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m128i __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
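
The self-initialization guarded by -Winit-self is deliberate: it yields a value with unspecified contents without emitting any zeroing instructions, and callers are expected to overwrite every lane before use, e.g. (illustrative):

    #include <emmintrin.h>

    __m128i make_constant (void)
    {
      __m128i v = _mm_undefined_si128 ();   /* contents unspecified, no initialization cost */
      v = _mm_set1_epi32 (42);              /* fully overwritten before first real use */
      return v;
    }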
+
+/* Create a vector of zeros.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si128 (void)
+{
+  return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_pd (__m128i __A)
+{
+  return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_ps (__m128i __A)
+{
+  return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_epi32 (__m128d __A)
+{
+  return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_pi32 (__m128d __A)
+{
+  return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpd_ps (__m128d __A)
+{
+  return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_epi32 (__m128d __A)
+{
+  return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttpd_pi32 (__m128d __A)
+{
+  return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_pd (__m64 __A)
+{
+  return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_epi32 (__m128 __A)
+{
+  return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_epi32 (__m128 __A)
+{
+  return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pd (__m128 __A)
+{
+  return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si32 (__m128d __A)
+{
+  return __builtin_ia32_cvtsd2si ((__v2df) __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64 (__m128d __A)
+{
+  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_si64x (__m128d __A)
+{
+  return __builtin_ia32_cvtsd2si64 ((__v2df) __A);
+}
+#endif
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si32 (__m128d __A)
+{
+  return __builtin_ia32_cvttsd2si ((__v2df) __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64 (__m128d __A)
+{
+  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttsd_si64x (__m128d __A)
+{
+  return __builtin_ia32_cvttsd2si64 ((__v2df) __A);
+}
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsd_ss (__m128 __A, __m128d __B)
+{
+  return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_sd (__m128d __A, int __B)
+{
+  return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_sd (__m128d __A, long long __B)
+{
+  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_sd (__m128d __A, long long __B)
+{
+  return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B);
+}
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_sd (__m128d __A, __m128 __B)
+{
+  return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
+{
+  return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask);
+}
+#else
+#define _mm_shuffle_pd(A, B, N)                                                \
+  ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A),               \
+                                  (__v2df)(__m128d)(B), (int)(N)))
+#endif
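
The shuffle selector must be an integer constant, which is why a macro replaces the inline function when __OPTIMIZE__ is not defined. _MM_SHUFFLE2(b, a) picks lane a of the first operand and lane b of the second; swapping the two lanes of one vector, for example (illustrative):

    #include <emmintrin.h>

    __m128d swap_lanes (__m128d a)
    {
      return _mm_shuffle_pd (a, a, _MM_SHUFFLE2 (0, 1));   /* result = {a[1], a[0]} */
    }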
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pd (__m128d __A, __m128d __B)
+{
+  return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pd (__m128d __A, double const *__B)
+{
+  return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pd (__m128d __A, double const *__B)
+{
+  return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pd (__m128d __A)
+{
+  return __builtin_ia32_movmskpd ((__v2df)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_packsswb128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v16qu)__A + (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hu)__A + (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4su)__A + (__v4su)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A + (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v16qu)__A - (__v16qu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hu)__A - (__v8hu)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4su)__A - (__v4su)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A - (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hu)__A * (__v8hu)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_su32 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi16 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi32 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_epi64 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psllqi128 ((__v2di)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi16 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_epi32 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bsrli_si128 (__m128i __A, const int __N)
+{
+  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_bslli_si128 (__m128i __A, const int __N)
+{
+  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si128 (__m128i __A, const int __N)
+{
+  return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si128 (__m128i __A, const int __N)
+{
+  return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8);
+}
+#else
+#define _mm_bsrli_si128(A, N) \
+  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
+#define _mm_bslli_si128(A, N) \
+  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
+#define _mm_srli_si128(A, N) \
+  ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8))
+#define _mm_slli_si128(A, N) \
+  ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8))
+#endif
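
These si128 shifts move the whole register by N bytes (not bits), and N must again be a compile-time constant, hence the macro fallback above. A small sketch (illustrative):

    #include <emmintrin.h>

    __m128i drop_lowest_int (__m128i v)
    {
      return _mm_srli_si128 (v, 4);   /* shift right 4 bytes: element 0 discarded, zeros shifted in */
    }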
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi16 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi32 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_epi64 (__m128i __A, int __B)
+{
+  return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_epi64 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A & (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A | (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si128 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v2du)__A ^ (__v2du)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v16qi)__A == (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hi)__A == (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4si)__A == (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v16qs)__A < (__v16qs)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hi)__A < (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4si)__A < (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v16qs)__A > (__v16qs)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v8hi)__A > (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) ((__v4si)__A > (__v4si)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi16 (__m128i const __A, int const __N)
+{
+  return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
+}
+#else
+#define _mm_extract_epi16(A, N) \
+  ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_insert_epi16(A, D, N)                              \
+  ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A),        \
+                                         (int)(D), (int)(N)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_epi8 (__m128i __A)
+{
+  return __builtin_ia32_pmovmskb128 ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflehi_epi16 (__m128i __A, const int __mask)
+{
+  return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shufflelo_epi16 (__m128i __A, const int __mask)
+{
+  return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi32 (__m128i __A, const int __mask)
+{
+  return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask);
+}
+#else
+#define _mm_shufflehi_epi16(A, N) \
+  ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_shufflelo_epi16(A, N) \
+  ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_shuffle_epi32(A, N) \
+  ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N)))
+#endif
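
As with the other immediate-operand intrinsics, the selector must be constant. _MM_SHUFFLE(d, c, b, a) packs four 2-bit lane indices, with element 0 of the result taken from lane a; broadcasting lane 0 looks like this (illustrative):

    #include <emmintrin.h>

    __m128i broadcast_lane0 (__m128i v)
    {
      return _mm_shuffle_epi32 (v, _MM_SHUFFLE (0, 0, 0, 0));
    }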
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
+{
+  __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_epu16 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_epu8 (__m128i __A, __m128i __B)
+{
+  return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si32 (int *__A, int __B)
+{
+  __builtin_ia32_movnti (__A, __B);
+}
+
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si64 (long long int *__A, long long int __B)
+{
+  __builtin_ia32_movnti64 (__A, __B);
+}
+#endif
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_si128 (__m128i *__A, __m128i __B)
+{
+  __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pd (double *__A, __m128d __B)
+{
+  __builtin_ia32_movntpd (__A, (__v2df)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clflush (void const *__A)
+{
+  __builtin_ia32_clflush (__A);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lfence (void)
+{
+  __builtin_ia32_lfence ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mfence (void)
+{
+  __builtin_ia32_mfence ();
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si128 (int __A)
+{
+  return _mm_set_epi32 (0, 0, 0, __A);
+}
+
+#ifdef __x86_64__
+/* Intel intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si128 (long long __A)
+{
+  return _mm_set_epi64x (0, __A);
+}
+#endif
+
+/* Casts between various SP, DP, INT vector types.  Note that these do no
+   conversion of values, they just change the type.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_ps(__m128d __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castpd_si128(__m128d __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_pd(__m128 __A)
+{
+  return (__m128d) __A;
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castps_si128(__m128 __A)
+{
+  return (__m128i) __A;
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_ps(__m128i __A)
+{
+  return (__m128) __A;
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_castsi128_pd(__m128i __A)
+{
+  return (__m128d) __A;
+}
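
Because these casts only reinterpret the 128-bit pattern, they are the idiomatic way to apply integer operations to floating-point data. For instance, an exact bitwise-equality test (illustrative; bits_equal is a hypothetical helper):

    #include <emmintrin.h>

    int bits_equal (__m128d a, __m128d b)
    {
      __m128i eq = _mm_cmpeq_epi32 (_mm_castpd_si128 (a), _mm_castpd_si128 (b));
      return _mm_movemask_epi8 (eq) == 0xFFFF;   /* all 16 bytes compared equal */
    }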
+
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+#endif /* _EMMINTRIN_H_INCLUDED */
diff --git a/include-gcc/enqcmdintrin.h b/include-gcc/enqcmdintrin.h
new file mode 100644 (file)
index 0000000..59682e2
--- /dev/null
@@ -0,0 +1,55 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <enqcmdintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _ENQCMDINTRIN_H_INCLUDED
+#define _ENQCMDINTRIN_H_INCLUDED
+
+#ifndef __ENQCMD__
+#pragma GCC push_options
+#pragma GCC target ("enqcmd")
+#define __DISABLE_ENQCMD__
+#endif /* __ENQCMD__ */
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enqcmd (void * __P, const void * __Q)
+{
+  return __builtin_ia32_enqcmd (__P, __Q);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enqcmds (void * __P, const void * __Q)
+{
+  return __builtin_ia32_enqcmds (__P, __Q);
+}
+
+#ifdef __DISABLE_ENQCMD__
+#undef __DISABLE_ENQCMD__
+#pragma GCC pop_options
+#endif /* __DISABLE_ENQCMD__ */
+#endif /* _ENQCMDINTRIN_H_INCLUDED */
diff --git a/include-gcc/f16cintrin.h b/include-gcc/f16cintrin.h
new file mode 100644 (file)
index 0000000..72c7c23
--- /dev/null
@@ -0,0 +1,98 @@
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <f16intrin.h> directly; include <x86intrin.h> or <immintrin.h> instead."
+#endif
+
+#ifndef _F16CINTRIN_H_INCLUDED
+#define _F16CINTRIN_H_INCLUDED
+
+#ifndef __F16C__
+#pragma GCC push_options
+#pragma GCC target("f16c")
+#define __DISABLE_F16C__
+#endif /* __F16C__ */
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cvtsh_ss (unsigned short __S)
+{
+  __v8hi __H = __extension__ (__v8hi){ (short) __S, 0, 0, 0, 0, 0, 0, 0 };
+  __v4sf __A = __builtin_ia32_vcvtph2ps (__H);
+  return __builtin_ia32_vec_ext_v4sf (__A, 0);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtph_ps (__m128i __A)
+{
+  return (__m128) __builtin_ia32_vcvtph2ps ((__v8hi) __A);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtph_ps (__m128i __A)
+{
+  return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_cvtss_sh (float __F, const int __I)
+{
+  __v4sf __A =  __extension__ (__v4sf){ __F, 0, 0, 0 };
+  __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);
+  return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_ph (__m128 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cvtps_ph (__m256 __A, const int __I)
+{
+  return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I);
+}
+#else
+#define _cvtss_sh(__F, __I)                                            \
+  (__extension__                                                       \
+   ({                                                                  \
+      __v4sf __A =  __extension__ (__v4sf){ __F, 0, 0, 0 };            \
+      __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I);                        \
+      (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0);           \
+    }))
+
+#define _mm_cvtps_ph(A, I) \
+  ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) (A), (int) (I)))
+
+#define _mm256_cvtps_ph(A, I) \
+  ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) (A), (int) (I)))
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_F16C__
+#undef __DISABLE_F16C__
+#pragma GCC pop_options
+#endif /* __DISABLE_F16C__ */
+
+#endif /* _F16CINTRIN_H_INCLUDED */
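
The scalar F16C helpers above convert between IEEE binary16 (stored in an unsigned short) and float; the immediate of _cvtss_sh selects the rounding mode, with 0 meaning round-to-nearest-even. A minimal round-trip sketch, assuming the code is built with -mf16c (illustrative):

    #include <immintrin.h>

    float roundtrip_half (float f)
    {
      unsigned short h = _cvtss_sh (f, 0);   /* float -> half, round to nearest even */
      return _cvtsh_ss (h);                  /* half -> float */
    }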
diff --git a/include-gcc/fma4intrin.h b/include-gcc/fma4intrin.h
new file mode 100644 (file)
index 0000000..e43a91f
--- /dev/null
@@ -0,0 +1,241 @@
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <fma4intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _FMA4INTRIN_H_INCLUDED
+#define _FMA4INTRIN_H_INCLUDED
+
+/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files.  */
+#include <ammintrin.h>
+
+#ifndef __FMA4__
+#pragma GCC push_options
+#pragma GCC target("fma4")
+#define __DISABLE_FMA4__
+#endif /* __FMA4__ */
+
+/* 128b Floating point multiply/add type instructions.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C)
+
+{
+  return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+}
+
+/* 256b Floating point multiply/add type instructions.  */
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C)
+
+{
+  return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C);
+}
+
+#ifdef __DISABLE_FMA4__
+#undef __DISABLE_FMA4__
+#pragma GCC pop_options
+#endif /* __DISABLE_FMA4__ */
+
+#endif
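Usage sketch (illustrative, not part of the imported header): every FMA4 intrinsic above lowers to the same vfmadd* builtin with sign-flipped operands, so _mm_macc_ps computes a*b + c, _mm_msub_ps computes a*b - c, and _mm_nmacc_ps computes -(a*b) + c, each with a single rounding. A minimal caller, assuming a build with -mfma4:

    #include <x86intrin.h>

    /* res = a*b + c on four packed floats in one rounding step (FMA4). */
    static __m128 fused_axpy_ps(__m128 a, __m128 b, __m128 c)
    {
        return _mm_macc_ps(a, b, c);
    }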
diff --git a/include-gcc/fmaintrin.h b/include-gcc/fmaintrin.h
new file mode 100644
index 0000000..f5d643e
--- /dev/null
@@ -0,0 +1,302 @@
+/* Copyright (C) 2011-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+# error "Never use <fmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _FMAINTRIN_H_INCLUDED
+#define _FMAINTRIN_H_INCLUDED
+
+#ifndef __FMA__
+#pragma GCC push_options
+#pragma GCC target("fma")
+#define __DISABLE_FMA__
+#endif /* __FMA__ */
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B,
+                                           (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B,
+                                              (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B,
+                                          (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B,
+                                             (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d) __builtin_ia32_vfmaddsd3 ((__v2df)__A, (__v2df)__B,
+                                             (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128) __builtin_ia32_vfmaddss3 ((__v4sf)__A, (__v4sf)__B,
+                                            (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B,
+                                           (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B,
+                                              (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B,
+                                          (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B,
+                                             (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmsubsd3 ((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmsubss3 ((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B,
+                                           (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B,
+                                              (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B,
+                                          (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B,
+                                             (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmaddsd3 ((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmadd_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmaddss3 ((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B,
+                                           (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B,
+                                              (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B,
+                                          (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B,
+                                             (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_sd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfnmsubsd3 ((__v2df)__A, (__v2df)__B,
+                                            (__v2df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fnmsub_ss (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfnmsubss3 ((__v4sf)__A, (__v4sf)__B,
+                                           (__v4sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+                                              (__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+                                                 (__v4df)__B,
+                                                 (__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+                                             (__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+                                                (__v8sf)__B,
+                                                (__v8sf)__C);
+}
+
+extern __inline __m128d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C)
+{
+  return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B,
+                                              -(__v2df)__C);
+}
+
+extern __inline __m256d
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C)
+{
+  return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A,
+                                                 (__v4df)__B,
+                                                 -(__v4df)__C);
+}
+
+extern __inline __m128
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C)
+{
+  return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B,
+                                             -(__v4sf)__C);
+}
+
+extern __inline __m256
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C)
+{
+  return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A,
+                                                (__v8sf)__B,
+                                                -(__v8sf)__C);
+}
+
+#ifdef __DISABLE_FMA__
+#undef __DISABLE_FMA__
+#pragma GCC pop_options
+#endif /* __DISABLE_FMA__ */
+
+#endif
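Usage sketch (illustrative): unlike the FMA4 header above, these FMA3 intrinsics are reached through <immintrin.h>, and the scalar variants map to the *ss3/*sd3 builtins. A minimal caller, assuming -mfma:

    #include <immintrin.h>

    /* d = a*b + c on two packed doubles with a single rounding (FMA3). */
    static __m128d fma_pd(__m128d a, __m128d b, __m128d c)
    {
        return _mm_fmadd_pd(a, b, c);
    }

    /* Negated form -(a*b) + c, matching _mm_fnmadd_pd above. */
    static __m128d fnma_pd(__m128d a, __m128d b, __m128d c)
    {
        return _mm_fnmadd_pd(a, b, c);
    }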
diff --git a/include-gcc/fxsrintrin.h b/include-gcc/fxsrintrin.h
new file mode 100644
index 0000000..26506a6
--- /dev/null
@@ -0,0 +1,73 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <fxsrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _FXSRINTRIN_H_INCLUDED
+#define _FXSRINTRIN_H_INCLUDED
+
+#ifndef __FXSR__
+#pragma GCC push_options
+#pragma GCC target("fxsr")
+#define __DISABLE_FXSR__
+#endif /* __FXSR__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave (void *__P)
+{
+  __builtin_ia32_fxsave (__P);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor (void *__P)
+{
+  __builtin_ia32_fxrstor (__P);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxsave64 (void *__P)
+{
+  __builtin_ia32_fxsave64 (__P);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_fxrstor64 (void *__P)
+{
+  __builtin_ia32_fxrstor64 (__P);
+}
+#endif
+
+#ifdef __DISABLE_FXSR__
+#undef __DISABLE_FXSR__
+#pragma GCC pop_options
+#endif /* __DISABLE_FXSR__ */
+
+
+#endif /* _FXSRINTRIN_H_INCLUDED */
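Usage sketch (illustrative): _fxsave/_fxrstor take a pointer to the FXSAVE area, which per the ISA must be a 512-byte region aligned to 16 bytes; the intrinsics themselves do not check this.

    #include <x86gprintrin.h>

    /* FXSAVE/FXRSTOR area: 512 bytes, 16-byte aligned (ISA requirement). */
    static unsigned char fx_area[512] __attribute__((aligned(16)));

    static void save_fpu_state(void)    { _fxsave(fx_area); }
    static void restore_fpu_state(void) { _fxrstor(fx_area); }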
diff --git a/include-gcc/gfniintrin.h b/include-gcc/gfniintrin.h
new file mode 100644
index 0000000..ef3dc22
--- /dev/null
@@ -0,0 +1,414 @@
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <gfniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _GFNIINTRIN_H_INCLUDED
+#define _GFNIINTRIN_H_INCLUDED
+
+#if !defined(__GFNI__) || !defined(__SSE2__)
+#pragma GCC push_options
+#pragma GCC target("gfni,sse2")
+#define __DISABLE_GFNI__
+#endif /* __GFNI__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8mul_epi8 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A,
+                                                  (__v16qi) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8affineinv_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi ((__v16qi) __A,
+                                                          (__v16qi) __B,
+                                                           __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_gf2p8affine_epi64_epi8 (__m128i __A, __m128i __B, const int __C)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi) __A,
+                                                       (__v16qi) __B, __C);
+}
+#else
+#define _mm_gf2p8affineinv_epi64_epi8(A, B, C)                            \
+  ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \
+                                          (__v16qi)(__m128i)(B), (int)(C)))
+#define _mm_gf2p8affine_epi64_epi8(A, B, C)                               \
+  ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi)(__m128i)(A),   \
+                                          (__v16qi)(__m128i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_GFNI__
+#undef __DISABLE_GFNI__
+#pragma GCC pop_options
+#endif /* __DISABLE_GFNI__ */
+
+#if !defined(__GFNI__) || !defined(__AVX__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx")
+#define __DISABLE_GFNIAVX__
+#endif /* __GFNIAVX__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8mul_epi8 (__m256i __A, __m256i __B)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi ((__v32qi) __A,
+                                                   (__v32qi) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8affineinv_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi ((__v32qi) __A,
+                                                          (__v32qi) __B,
+                                                           __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_gf2p8affine_epi64_epi8 (__m256i __A, __m256i __B, const int __C)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi) __A,
+                                                       (__v32qi) __B, __C);
+}
+#else
+#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C)                         \
+  ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \
+                                                   (__v32qi)(__m256i)(B), \
+                                                   (int)(C)))
+#define _mm256_gf2p8affine_epi64_epi8(A, B, C)                            \
+  ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi)(__m256i)(A),   \
+                                       (   __v32qi)(__m256i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX__
+#undef __DISABLE_GFNIAVX__
+#pragma GCC pop_options
+#endif /* __GFNIAVX__ */
+
+#if !defined(__GFNI__) || !defined(__AVX512VL__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx512vl")
+#define __DISABLE_GFNIAVX512VL__
+#endif /* __GFNIAVX512VL__ */
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8mul_epi8 (__m128i __A, __mmask16 __B, __m128i __C, __m128i __D)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __C,
+                                                        (__v16qi) __D,
+                                                        (__v16qi)__A, __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8mul_epi8 (__mmask16 __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __B,
+                       (__v16qi) __C, (__v16qi) _mm_setzero_si128 (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8affineinv_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
+                                   __m128i __D, const int __E)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __C,
+                                                               (__v16qi) __D,
+                                                                __E,
+                                                               (__v16qi)__A,
+                                                                __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8affineinv_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
+                                    const int __D)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __B,
+                                               (__v16qi) __C, __D,
+                                               (__v16qi) _mm_setzero_si128 (),
+                                                __A);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_gf2p8affine_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C,
+                                __m128i __D, const int __E)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __C,
+                                       (__v16qi) __D, __E, (__v16qi)__A, __B);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskz_gf2p8affine_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C,
+                                 const int __D)
+{
+  return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __B,
+                    (__v16qi) __C, __D, (__v16qi) _mm_setzero_si128 (), __A);
+}
+#else
+#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E)                 \
+  ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask(                 \
+                       (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D),      \
+                       (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
+#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \
+  ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask(                 \
+                       (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C),      \
+                       (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), \
+                       (__mmask16)(A)))
+#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \
+  ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(C),\
+      (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B)))
+#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D)                       \
+  ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B),\
+               (__v16qi)(__m128i)(C), (int)(D),                            \
+               (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX512VL__
+#undef __DISABLE_GFNIAVX512VL__
+#pragma GCC pop_options
+#endif /* __GFNIAVX512VL__ */
+
+#if !defined(__GFNI__) || !defined(__AVX512VL__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx512vl,avx512bw")
+#define __DISABLE_GFNIAVX512VLBW__
+#endif /* __GFNIAVX512VLBW__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8mul_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
+                          __m256i __D)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __C,
+                                                        (__v32qi) __D,
+                                                        (__v32qi)__A, __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8mul_epi8 (__mmask32 __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __B,
+                       (__v32qi) __C, (__v32qi) _mm256_setzero_si256 (), __A);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8affineinv_epi64_epi8 (__m256i __A, __mmask32 __B,
+                                      __m256i __C, __m256i __D, const int __E)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __C,
+                                                               (__v32qi) __D,
+                                                                __E,
+                                                               (__v32qi)__A,
+                                                                __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8affineinv_epi64_epi8 (__mmask32 __A, __m256i __B,
+                                       __m256i __C, const int __D)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __B,
+                                     (__v32qi) __C, __D,
+                                     (__v32qi) _mm256_setzero_si256 (), __A);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_mask_gf2p8affine_epi64_epi8 (__m256i __A, __mmask32 __B, __m256i __C,
+                                   __m256i __D, const int __E)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __C,
+                                                            (__v32qi) __D,
+                                                             __E,
+                                                            (__v32qi)__A,
+                                                             __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_maskz_gf2p8affine_epi64_epi8 (__mmask32 __A, __m256i __B,
+                                    __m256i __C, const int __D)
+{
+  return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __B,
+               (__v32qi) __C, __D, (__v32qi)_mm256_setzero_si256 (), __A);
+}
+#else
+#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E)           \
+  ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask(              \
+       (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E),         \
+       (__v32qi)(__m256i)(A), (__mmask32)(B)))
+#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D)             \
+  ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask(              \
+       (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D),         \
+       (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
+#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E)                  \
+  ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(C),\
+       (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B)))
+#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D)                            \
+  ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B),\
+        (__v32qi)(__m256i)(C), (int)(D),                                   \
+        (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX512VLBW__
+#undef __DISABLE_GFNIAVX512VLBW__
+#pragma GCC pop_options
+#endif /* __GFNIAVX512VLBW__ */
+
+#if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__)
+#pragma GCC push_options
+#pragma GCC target("gfni,avx512f,avx512bw")
+#define __DISABLE_GFNIAVX512FBW__
+#endif /* __GFNIAVX512FBW__ */
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8mul_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+                          __m512i __D)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __C,
+                                       (__v64qi) __D, (__v64qi)__A, __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8mul_epi8 (__mmask64 __A, __m512i __B, __m512i __C)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __B,
+                       (__v64qi) __C, (__v64qi) _mm512_setzero_si512 (), __A);
+}
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B)
+{
+  return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A,
+                                                   (__v64qi) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8affineinv_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+                                      __m512i __D, const int __E)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __C,
+                                                               (__v64qi) __D,
+                                                                __E,
+                                                               (__v64qi)__A,
+                                                                __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8affineinv_epi64_epi8 (__mmask64 __A, __m512i __B,
+                                       __m512i __C, const int __D)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __B,
+                               (__v64qi) __C, __D,
+                               (__v64qi) _mm512_setzero_si512 (), __A);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A,
+                                                          (__v64qi) __B, __C);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_gf2p8affine_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C,
+                                   __m512i __D, const int __E)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __C,
+                                       (__v64qi) __D, __E, (__v64qi)__A, __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_maskz_gf2p8affine_epi64_epi8 (__mmask64 __A, __m512i __B, __m512i __C,
+                                    const int __D)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __B,
+                 (__v64qi) __C, __D, (__v64qi) _mm512_setzero_si512 (), __A);
+}
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C)
+{
+  return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A,
+                                                       (__v64qi) __B, __C);
+}
+#else
+#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E)           \
+  ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask(              \
+       (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E),         \
+       (__v64qi)(__m512i)(A), (__mmask64)(B)))
+#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D)             \
+  ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask(              \
+       (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D),         \
+       (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
+#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C)                      \
+  ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi (                  \
+       (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C)))
+#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E)                  \
+  ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(C),\
+     (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B)))
+#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D)                            \
+  ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(B),\
+        (__v64qi)(__m512i)(C), (int)(D),                                   \
+        (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A)))
+#define _mm512_gf2p8affine_epi64_epi8(A, B, C)                             \
+  ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A),    \
+        (__v64qi)(__m512i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_GFNIAVX512FBW__
+#undef __DISABLE_GFNIAVX512FBW__
+#pragma GCC pop_options
+#endif /* __GFNIAVX512FBW__ */
+
+#endif /* _GFNIINTRIN_H_INCLUDED */
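Usage sketch (illustrative): the unmasked GFNI forms need only GFNI+SSE2; _mm_gf2p8mul_epi8 multiplies each byte pair in GF(2^8) reduced by the AES polynomial x^8 + x^4 + x^3 + x + 1. A minimal caller, assuming -mgfni -msse2:

    #include <immintrin.h>

    /* Byte-wise GF(2^8) multiply of two 16-byte vectors (AES field). */
    static __m128i gf_mul_bytes(__m128i a, __m128i b)
    {
        return _mm_gf2p8mul_epi8(a, b);
    }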
diff --git a/include-gcc/hresetintrin.h b/include-gcc/hresetintrin.h
new file mode 100644
index 0000000..7a29665
--- /dev/null
@@ -0,0 +1,48 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86GPRINTRIN_H_INCLUDED
+# error "Never use <hresetintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _HRESETINTRIN_H_INCLUDED
+#define _HRESETINTRIN_H_INCLUDED
+
+#ifndef __HRESET__
+#pragma GCC push_options
+#pragma GCC target ("hreset")
+#define __DISABLE_HRESET__
+#endif /* __HRESET__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_hreset (unsigned int __EAX)
+{
+  __builtin_ia32_hreset (__EAX);
+}
+
+#ifdef __DISABLE_HRESET__
+#undef __DISABLE_HRESET__
+#pragma GCC pop_options
+#endif /* __DISABLE_HRESET__ */
+#endif /* _HRESETINTRIN_H_INCLUDED.  */
diff --git a/include-gcc/ia32intrin.h b/include-gcc/ia32intrin.h
new file mode 100644
index 0000000..25b19bd
--- /dev/null
@@ -0,0 +1,317 @@
+/* Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <ia32intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+/* 32bit bsf */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfd (int __X)
+{
+  return __builtin_ctz (__X);
+}
+
+/* 32bit bsr */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrd (int __X)
+{
+  return __builtin_ia32_bsrsi (__X);
+}
+
+/* 32bit bswap */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapd (int __X)
+{
+  return __builtin_bswap32 (__X);
+}
+
+#ifndef __iamcu__
+
+#ifndef __CRC32__
+#pragma GCC push_options
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
+
+/* 32bit accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32b (unsigned int __C, unsigned char __V)
+{
+  return __builtin_ia32_crc32qi (__C, __V);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32w (unsigned int __C, unsigned short __V)
+{
+  return __builtin_ia32_crc32hi (__C, __V);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32d (unsigned int __C, unsigned int __V)
+{
+  return __builtin_ia32_crc32si (__C, __V);
+}
+
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
+#pragma GCC pop_options
+#endif /* __DISABLE_CRC32__ */
+
+#endif /* __iamcu__ */
+
+/* 32bit popcnt */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntd (unsigned int __X)
+{
+  return __builtin_popcount (__X);
+}
+
+#ifndef __iamcu__
+
+/* rdpmc */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdpmc (int __S)
+{
+  return __builtin_ia32_rdpmc (__S);
+}
+
+#endif /* __iamcu__ */
+
+/* rdtsc */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdtsc (void)
+{
+  return __builtin_ia32_rdtsc ();
+}
+
+#ifndef __iamcu__
+
+/* rdtscp */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rdtscp (unsigned int *__A)
+{
+  return __builtin_ia32_rdtscp (__A);
+}
+
+#endif /* __iamcu__ */
+
+/* 8bit rol */
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolb (unsigned char __X, int __C)
+{
+  return __builtin_ia32_rolqi (__X, __C);
+}
+
+/* 16bit rol */
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolw (unsigned short __X, int __C)
+{
+  return __builtin_ia32_rolhi (__X, __C);
+}
+
+/* 32bit rol */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rold (unsigned int __X, int __C)
+{
+  __C &= 31;
+  return (__X << __C) | (__X >> (-__C & 31));
+}
+
+/* 8bit ror */
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorb (unsigned char __X, int __C)
+{
+  return __builtin_ia32_rorqi (__X, __C);
+}
+
+/* 16bit ror */
+extern __inline unsigned short
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorw (unsigned short __X, int __C)
+{
+  return __builtin_ia32_rorhi (__X, __C);
+}
+
+/* 32bit ror */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rord (unsigned int __X, int __C)
+{
+  __C &= 31;
+  return (__X >> __C) | (__X << (-__C & 31));
+}
+
+/* Pause */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__pause (void)
+{
+  __builtin_ia32_pause ();
+}
+
+#ifdef __x86_64__
+/* 64bit bsf */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsfq (long long __X)
+{
+  return __builtin_ctzll (__X);
+}
+
+/* 64bit bsr */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bsrq (long long __X)
+{
+  return __builtin_ia32_bsrdi (__X);
+}
+
+/* 64bit bswap */
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bswapq (long long __X)
+{
+  return __builtin_bswap64 (__X);
+}
+
+#ifndef __CRC32__
+#pragma GCC push_options
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
+
+/* 64bit accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__crc32q (unsigned long long __C, unsigned long long __V)
+{
+  return __builtin_ia32_crc32di (__C, __V);
+}
+
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
+#pragma GCC pop_options
+#endif /* __DISABLE_CRC32__ */
+
+/* 64bit popcnt */
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__popcntq (unsigned long long __X)
+{
+  return __builtin_popcountll (__X);
+}
+
+/* 64bit rol */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rolq (unsigned long long __X, int __C)
+{
+  __C &= 63;
+  return (__X << __C) | (__X >> (-__C & 63));
+}
+
+/* 64bit ror */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__rorq (unsigned long long __X, int __C)
+{
+  __C &= 63;
+  return (__X >> __C) | (__X << (-__C & 63));
+}
+
+/* Read flags register */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+  return __builtin_ia32_readeflags_u64 ();
+}
+
+/* Write flags register */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned long long __X)
+{
+  __builtin_ia32_writeeflags_u64 (__X);
+}
+
+#define _bswap64(a)            __bswapq(a)
+#define _popcnt64(a)           __popcntq(a)
+#else
+
+/* Read flags register */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__readeflags (void)
+{
+  return __builtin_ia32_readeflags_u32 ();
+}
+
+/* Write flags register */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__writeeflags (unsigned int __X)
+{
+  __builtin_ia32_writeeflags_u32 (__X);
+}
+
+#endif
+
+/* On LP64 systems, longs are 64-bit.  Use the appropriate rotate
+ * function.  */
+#ifdef __LP64__
+#define _lrotl(a,b)            __rolq((a), (b))
+#define _lrotr(a,b)            __rorq((a), (b))
+#else
+#define _lrotl(a,b)            __rold((a), (b))
+#define _lrotr(a,b)            __rord((a), (b))
+#endif
+
+#define _bit_scan_forward(a)   __bsfd(a)
+#define _bit_scan_reverse(a)   __bsrd(a)
+#define _bswap(a)              __bswapd(a)
+#define _popcnt32(a)           __popcntd(a)
+#ifndef __iamcu__
+#define _rdpmc(a)              __rdpmc(a)
+#define _rdtscp(a)             __rdtscp(a)
+#endif /* __iamcu__ */
+#define _rdtsc()               __rdtsc()
+#define _rotwl(a,b)            __rolw((a), (b))
+#define _rotwr(a,b)            __rorw((a), (b))
+#define _rotl(a,b)             __rold((a), (b))
+#define _rotr(a,b)             __rord((a), (b))
diff --git a/include-gcc/immintrin.h b/include-gcc/immintrin.h
new file mode 100644
index 0000000..b220d87
--- /dev/null
@@ -0,0 +1,143 @@
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#define _IMMINTRIN_H_INCLUDED
+
+#include <x86gprintrin.h>
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <smmintrin.h>
+
+#include <wmmintrin.h>
+
+#include <avxintrin.h>
+
+#include <avxvnniintrin.h>
+
+#include <avxifmaintrin.h>
+
+#include <avxvnniint8intrin.h>
+
+#include <avx2intrin.h>
+
+#include <avx512fintrin.h>
+
+#include <avx512erintrin.h>
+
+#include <avx512pfintrin.h>
+
+#include <avx512cdintrin.h>
+
+#include <avx512vlintrin.h>
+
+#include <avx512bwintrin.h>
+
+#include <avx512dqintrin.h>
+
+#include <avx512vlbwintrin.h>
+
+#include <avx512vldqintrin.h>
+
+#include <avx512ifmaintrin.h>
+
+#include <avx512ifmavlintrin.h>
+
+#include <avx512vbmiintrin.h>
+
+#include <avx512vbmivlintrin.h>
+
+#include <avx5124fmapsintrin.h>
+
+#include <avx5124vnniwintrin.h>
+
+#include <avx512vpopcntdqintrin.h>
+
+#include <avx512vbmi2intrin.h>
+
+#include <avx512vbmi2vlintrin.h>
+
+#include <avx512vnniintrin.h>
+
+#include <avx512vnnivlintrin.h>
+
+#include <avx512vpopcntdqvlintrin.h>
+
+#include <avx512bitalgintrin.h>
+
+#include <avx512vp2intersectintrin.h>
+
+#include <avx512vp2intersectvlintrin.h>
+
+#ifdef __SSE2__
+#include <avx512fp16intrin.h>
+
+#include <avx512fp16vlintrin.h>
+#endif
+
+#include <shaintrin.h>
+
+#include <fmaintrin.h>
+
+#include <f16cintrin.h>
+
+#include <rtmintrin.h>
+
+#include <gfniintrin.h>
+
+#include <vaesintrin.h>
+
+#include <vpclmulqdqintrin.h>
+
+#ifdef __SSE2__
+#include <avx512bf16vlintrin.h>
+
+#include <avx512bf16intrin.h>
+
+#include <avxneconvertintrin.h>
+#endif
+
+#include <amxtileintrin.h>
+
+#include <amxint8intrin.h>
+
+#include <amxbf16intrin.h>
+
+#include <amxcomplexintrin.h>
+
+#include <prfchwintrin.h>
+
+#include <keylockerintrin.h>
+
+#include <amxfp16intrin.h>
+
+#endif /* _IMMINTRIN_H_INCLUDED */
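Usage sketch (illustrative): application code includes only this umbrella header and selects ISA subsets either globally (-mavx2, -mfma, ...) or per function with the target attribute, mirroring the push_options/target pragmas the sub-headers use internally.

    #include <immintrin.h>

    /* Per-function ISA selection; only this function requires AVX2. */
    __attribute__((target("avx2")))
    static __m256i add8x32(__m256i a, __m256i b)
    {
        return _mm256_add_epi32(a, b);
    }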
diff --git a/include-gcc/keylockerintrin.h b/include-gcc/keylockerintrin.h
new file mode 100644
index 0000000..09c4712
--- /dev/null
@@ -0,0 +1,129 @@
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED
+# error "Never use <keylockerintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _KEYLOCKERINTRIN_H_INCLUDED
+#define _KEYLOCKERINTRIN_H_INCLUDED
+
+#ifndef __KL__
+#pragma GCC push_options
+#pragma GCC target("kl")
+#define __DISABLE_KL__
+#endif /* __KL__ */
+
+
+extern __inline
+void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadiwkey (unsigned int __I, __m128i __A, __m128i __B, __m128i __C)
+{
+  __builtin_ia32_loadiwkey ((__v2di) __B, (__v2di) __C, (__v2di) __A, __I);
+}
+
+extern __inline
+unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_encodekey128_u32 (unsigned int __I, __m128i __A, void * __P)
+{
+  return __builtin_ia32_encodekey128_u32 (__I, (__v2di)__A, __P);
+}
+
+extern __inline
+unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_encodekey256_u32 (unsigned int __I, __m128i __A, __m128i __B, void * __P)
+{
+  return __builtin_ia32_encodekey256_u32 (__I, (__v2di)__A, (__v2di)__B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec128kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+  return __builtin_ia32_aesdec128kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec256kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+  return __builtin_ia32_aesdec256kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc128kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+  return __builtin_ia32_aesenc128kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc256kl_u8 (__m128i * __A, __m128i __B, const void * __P)
+{
+  return __builtin_ia32_aesenc256kl_u8 ((__v2di *) __A, (__v2di) __B, __P);
+}
+
+#ifdef __DISABLE_KL__
+#undef __DISABLE_KL__
+#pragma GCC pop_options
+#endif /* __DISABLE_KL__ */
+
+#ifndef __WIDEKL__
+#pragma GCC push_options
+#pragma GCC target("widekl")
+#define __DISABLE_WIDEKL__
+#endif /* __WIDEKL__ */
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdecwide128kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+  return __builtin_ia32_aesdecwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdecwide256kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+  return __builtin_ia32_aesdecwide256kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesencwide128kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+  return __builtin_ia32_aesencwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+
+extern __inline
+unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesencwide256kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P)
+{
+  return __builtin_ia32_aesencwide256kl_u8 ((__v2di *) __A, (__v2di *) __B, __P);
+}
+#ifdef __DISABLE_WIDEKL__
+#undef __DISABLE_WIDEKL__
+#pragma GCC pop_options
+#endif /* __DISABLE_WIDEKL__ */
+#endif /* _KEYLOCKERINTRIN_H_INCLUDED */
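Usage sketch (illustrative; handle sizes follow the Key Locker specification, not this header): _mm_encodekey128_u32 wraps a raw AES-128 key into an opaque 384-bit handle, which the aesenc/aesdec*kl intrinsics then consume in place of the key. Assuming a build with -mkl:

    #include <immintrin.h>

    /* Wrap a raw AES-128 key into a 48-byte Key Locker handle.  The return
       value is the instruction's EAX output (wrapping-key information). */
    static unsigned int wrap_key128(__m128i raw_key, unsigned char handle[48])
    {
        return _mm_encodekey128_u32(0 /* no usage restrictions */, raw_key, handle);
    }

    /* Encrypt one block with the handle; the returned status byte indicates
       whether the handle was accepted (see the ISA reference for details). */
    static unsigned char enc_block(__m128i in, __m128i *out,
                                   const unsigned char handle[48])
    {
        return _mm_aesenc128kl_u8(out, in, handle);
    }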
diff --git a/include-gcc/lwpintrin.h b/include-gcc/lwpintrin.h
new file mode 100644
index 0000000..b3e9e1d
--- /dev/null
@@ -0,0 +1,107 @@
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <lwpintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _LWPINTRIN_H_INCLUDED
+#define _LWPINTRIN_H_INCLUDED
+
+#ifndef __LWP__
+#pragma GCC push_options
+#pragma GCC target("lwp")
+#define __DISABLE_LWP__
+#endif /* __LWP__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__llwpcb (void *__pcbAddress)
+{
+  __builtin_ia32_llwpcb (__pcbAddress);
+}
+
+extern __inline void * __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__slwpcb (void)
+{
+  return __builtin_ia32_slwpcb ();
+}
+
+#ifdef __OPTIMIZE__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpval32 (unsigned int __data2, unsigned int __data1, unsigned int __flags)
+{
+  __builtin_ia32_lwpval32 (__data2, __data1, __flags);
+}
+
+#ifdef __x86_64__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpval64 (unsigned long long __data2, unsigned int __data1,
+           unsigned int __flags)
+{
+  __builtin_ia32_lwpval64 (__data2, __data1, __flags);
+}
+#endif
+#else
+#define __lwpval32(D2, D1, F) \
+  (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), \
+                           (unsigned int) (F)))
+#ifdef __x86_64__
+#define __lwpval64(D2, D1, F) \
+  (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), \
+                           (unsigned int) (F)))
+#endif
+#endif
+
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpins32 (unsigned int __data2, unsigned int __data1, unsigned int __flags)
+{
+  return __builtin_ia32_lwpins32 (__data2, __data1, __flags);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lwpins64 (unsigned long long __data2, unsigned int __data1,
+           unsigned int __flags)
+{
+  return __builtin_ia32_lwpins64 (__data2, __data1, __flags);
+}
+#endif
+#else
+#define __lwpins32(D2, D1, F) \
+  (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), \
+                           (unsigned int) (F)))
+#ifdef __x86_64__
+#define __lwpins64(D2, D1, F) \
+  (__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), \
+                           (unsigned int) (F)))
+#endif
+#endif
+
+#ifdef __DISABLE_LWP__
+#undef __DISABLE_LWP__
+#pragma GCC pop_options
+#endif /* __DISABLE_LWP__ */
+
+#endif /* _LWPINTRIN_H_INCLUDED */
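LWP (AMD Lightweight Profiling) is driven through the control-block intrinsics above; a small sketch, assuming __slwpcb returns a null pointer while profiling is inactive:

#include <x86gprintrin.h>       /* build with -mlwp; AMD-only and long retired */

/* Nonzero when an LWP control block is installed for the current thread. */
static int lwp_active (void)
{
  return __slwpcb () != 0;
}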
diff --git a/include-gcc/lzcntintrin.h b/include-gcc/lzcntintrin.h
new file mode 100644 (file)
index 0000000..4d81985
--- /dev/null
@@ -0,0 +1,75 @@
+/* Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <lzcntintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+
+#ifndef _LZCNTINTRIN_H_INCLUDED
+#define _LZCNTINTRIN_H_INCLUDED
+
+#ifndef __LZCNT__
+#pragma GCC push_options
+#pragma GCC target("lzcnt")
+#define __DISABLE_LZCNT__
+#endif /* __LZCNT__ */
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt16 (unsigned short __X)
+{
+  return __builtin_ia32_lzcnt_u16 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt32 (unsigned int __X)
+{
+  return __builtin_ia32_lzcnt_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lzcnt_u32 (unsigned int __X)
+{
+  return __builtin_ia32_lzcnt_u32 (__X);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__lzcnt64 (unsigned long long __X)
+{
+  return __builtin_ia32_lzcnt_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_lzcnt_u64 (unsigned long long __X)
+{
+  return __builtin_ia32_lzcnt_u64 (__X);
+}
+#endif
+
+#ifdef __DISABLE_LZCNT__
+#undef __DISABLE_LZCNT__
+#pragma GCC pop_options
+#endif /* __DISABLE_LZCNT__ */
+
+#endif /* _LZCNTINTRIN_H_INCLUDED */
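A short example of the leading-zero-count intrinsics above; unlike __builtin_clz, _lzcnt_u32(0) is defined and returns the operand width (32):

#include <x86gprintrin.h>       /* build with -mlzcnt */

/* Index of the most significant set bit; caller guarantees x != 0. */
static unsigned int msb_index (unsigned int x)
{
  return 31 - _lzcnt_u32 (x);
}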
diff --git a/include-gcc/mm3dnow.h b/include-gcc/mm3dnow.h
new file mode 100644 (file)
index 0000000..f8ef374
--- /dev/null
@@ -0,0 +1,233 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the mm3dnow.h (of supposedly AMD origin) included with
+   MSVC 7.1.  */
+
+#ifndef _MM3DNOW_H_INCLUDED
+#define _MM3DNOW_H_INCLUDED
+
+#include <mmintrin.h>
+#include <prfchwintrin.h>
+
+#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW__
+#pragma GCC push_options
+#ifdef __x86_64__
+#pragma GCC target("sse,3dnow")
+#else
+#pragma GCC target("3dnow")
+#endif
+#define __DISABLE_3dNOW__
+#endif /* __3dNOW__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_femms (void)
+{
+  __builtin_ia32_femms();
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgusb (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pf2id (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pf2id ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfacc (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfadd (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpeq (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpge (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfcmpgt (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmax (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmin (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfmul (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcp (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcpit1 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrcpit2 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrsqrt (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfrsqit1 (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfsub (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfsubr (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pi2fd (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pi2fd ((__v2si)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhrw (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetch (void *__P)
+{
+  __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_float (float __A)
+{
+  return __extension__ (__m64)(__v2sf){ __A, 0.0f };
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_float (__m64 __A)
+{
+  union { __v2sf v; float a[2]; } __tmp;
+  __tmp.v = (__v2sf)__A;
+  return __tmp.a[0];
+}
+
+#ifdef __DISABLE_3dNOW__
+#undef __DISABLE_3dNOW__
+#pragma GCC pop_options
+#endif /* __DISABLE_3dNOW__ */
+
+#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW_A__
+#pragma GCC push_options
+#ifdef __x86_64__
+#pragma GCC target("sse,3dnowa")
+#else
+#pragma GCC target("3dnowa")
+#endif
+#define __DISABLE_3dNOW_A__
+#endif /* __3dNOW_A__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pf2iw (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfnacc (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pfpnacc (__m64 __A, __m64 __B)
+{
+  return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pi2fw (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pi2fw ((__v2si)__A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pswapd (__m64 __A)
+{
+  return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A);
+}
+
+#ifdef __DISABLE_3dNOW_A__
+#undef __DISABLE_3dNOW_A__
+#pragma GCC pop_options
+#endif /* __DISABLE_3dNOW_A__ */
+
+#endif /* _MM3DNOW_H_INCLUDED */
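An illustrative sketch of the 3DNow! packed-float intrinsics above, using the _m_from_float/_m_to_float helpers from this header. On legacy x87 targets the placement of _m_femms relative to ordinary float code needs care, so treat this as a sketch rather than a drop-in routine:

#include <mm3dnow.h>            /* build with -m3dnow; AMD-only, deprecated ISA */

/* Add two scalars through the packed 3DNow! adder. */
static float pf_add (float a, float b)
{
  __m64 r = _m_pfadd (_m_from_float (a), _m_from_float (b));
  float out = _m_to_float (r);
  _m_femms ();                  /* leave the MMX/3DNow! state before further FP code */
  return out;
}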
diff --git a/include-gcc/mm_malloc.h b/include-gcc/mm_malloc.h
new file mode 100644 (file)
index 0000000..3527283
--- /dev/null
@@ -0,0 +1,57 @@
+/* Copyright (C) 2004-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _MM_MALLOC_H_INCLUDED
+#define _MM_MALLOC_H_INCLUDED
+
+#include <stdlib.h>
+
+/* We can't depend on <stdlib.h> since the prototype of posix_memalign
+   may not be visible.  */
+#ifndef __cplusplus
+extern int posix_memalign (void **, size_t, size_t);
+#else
+extern "C" int posix_memalign (void **, size_t, size_t) throw ();
+#endif
+
+static __inline void *
+_mm_malloc (size_t __size, size_t __alignment)
+{
+  void *__ptr;
+  if (__alignment == 1)
+    return malloc (__size);
+  if (__alignment == 2 || (sizeof (void *) == 8 && __alignment == 4))
+    __alignment = sizeof (void *);
+  if (posix_memalign (&__ptr, __alignment, __size) == 0)
+    return __ptr;
+  else
+    return NULL;
+}
+
+static __inline void
+_mm_free (void *__ptr)
+{
+  free (__ptr);
+}
+
+#endif /* _MM_MALLOC_H_INCLUDED */
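Usage of the aligned-allocation wrappers above is straightforward: the alignment must be a power of two (small values are rounded up to sizeof(void *) as the code shows), and allocations should be released with _mm_free:

#include <mm_malloc.h>

/* 64-byte-aligned scratch buffer, suitable for cache-line or AVX-512 accesses. */
static float *alloc_scratch (size_t n)
{
  return _mm_malloc (n * sizeof (float), 64);   /* NULL on failure */
}
/* ... when done: _mm_free (buf); */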
diff --git a/include-gcc/mmintrin.h b/include-gcc/mmintrin.h
new file mode 100644 (file)
index 0000000..fbac9c3
--- /dev/null
@@ -0,0 +1,965 @@
+/* Copyright (C) 2002-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _MMINTRIN_H_INCLUDED
+#define _MMINTRIN_H_INCLUDED
+
+#if defined __x86_64__ && !defined __SSE__ || !defined __MMX__
+#pragma GCC push_options
+#ifdef __MMX_WITH_SSE__
+#pragma GCC target("sse2")
+#elif defined __x86_64__
+#pragma GCC target("sse,mmx")
+#else
+#pragma GCC target("mmx")
+#endif
+#define __DISABLE_MMX__
+#endif /* __MMX__ */
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
+typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__));
+typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__));
+
+/* Unaligned version of the same type  */
+typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1)));
+typedef int __m32_u __attribute__ ((__vector_size__ (4), \
+                                   __may_alias__, __aligned__ (1)));
+typedef short __m16_u __attribute__ ((__vector_size__ (2), \
+                                     __may_alias__, __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics.  */
+typedef int __v2si __attribute__ ((__vector_size__ (8)));
+typedef short __v4hi __attribute__ ((__vector_size__ (8)));
+typedef char __v8qi __attribute__ ((__vector_size__ (8)));
+typedef long long __v1di __attribute__ ((__vector_size__ (8)));
+typedef float __v2sf __attribute__ ((__vector_size__ (8)));
+
+/* Empty the multimedia state.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+  __builtin_ia32_emms ();
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_empty (void)
+{
+  _mm_empty ();
+}
+
+/* Convert I to a __m64 object.  The integer is zero-extended to 64-bits.  */
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_si64 (int __i)
+{
+  return (__m64) __builtin_ia32_vec_init_v2si (__i, 0);
+}
+
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_int (int __i)
+{
+  return _mm_cvtsi32_si64 (__i);
+}
+
+#ifdef __x86_64__
+/* Convert I to a __m64 object.  */
+
+/* Intel intrinsic.  */
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_from_int64 (long long __i)
+{
+  return (__m64) __i;
+}
+
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_m64 (long long __i)
+{
+  return (__m64) __i;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_si64 (long long __i)
+{
+  return (__m64) __i;
+}
+
+extern __inline __m64  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi64x (long long __i)
+{
+  return (__m64) __i;
+}
+#endif
+
+/* Convert the lower 32 bits of the __m64 object into an integer.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si32 (__m64 __i)
+{
+  return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_int (__m64 __i)
+{
+  return _mm_cvtsi64_si32 (__i);
+}
+
+#ifdef __x86_64__
+/* Convert the __m64 object to a 64bit integer.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_to_int64 (__m64 __i)
+{
+  return (long long)__i;
+}
+
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtm64_si64 (__m64 __i)
+{
+  return (long long)__i;
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_si64x (__m64 __i)
+{
+  return (long long)__i;
+}
+#endif
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+   the result, and the four 16-bit values from M2 into the upper four 8-bit
+   values of the result, all with signed saturation.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packsswb (__m64 __m1, __m64 __m2)
+{
+  return _mm_packs_pi16 (__m1, __m2);
+}
+
+/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
+   the result, and the two 32-bit values from M2 into the upper two 16-bit
+   values of the result, all with signed saturation.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packssdw (__m64 __m1, __m64 __m2)
+{
+  return _mm_packs_pi32 (__m1, __m2);
+}
+
+/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
+   the result, and the four 16-bit values from M2 into the upper four 8-bit
+   values of the result, all with unsigned saturation.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packs_pu16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_packuswb (__m64 __m1, __m64 __m2)
+{
+  return _mm_packs_pu16 (__m1, __m2);
+}
+
+/* Interleave the four 8-bit values from the high half of M1 with the four
+   8-bit values from the high half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhbw (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpackhi_pi8 (__m1, __m2);
+}
+
+/* Interleave the two 16-bit values from the high half of M1 with the two
+   16-bit values from the high half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhwd (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpackhi_pi16 (__m1, __m2);
+}
+
+/* Interleave the 32-bit value from the high half of M1 with the 32-bit
+   value from the high half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckhdq (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpackhi_pi32 (__m1, __m2);
+}
+
+/* Interleave the four 8-bit values from the low half of M1 with the four
+   8-bit values from the low half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpcklbw (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpacklo_pi8 (__m1, __m2);
+}
+
+/* Interleave the two 16-bit values from the low half of M1 with the two
+   16-bit values from the low half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpcklwd (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpacklo_pi16 (__m1, __m2);
+}
+
+/* Interleave the 32-bit value from the low half of M1 with the 32-bit
+   value from the low half of M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_punpckldq (__m64 __m1, __m64 __m2)
+{
+  return _mm_unpacklo_pi32 (__m1, __m2);
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddb (__m64 __m1, __m64 __m2)
+{
+  return _mm_add_pi8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddw (__m64 __m1, __m64 __m2)
+{
+  return _mm_add_pi16 (__m1, __m2);
+}
+
+/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddd (__m64 __m1, __m64 __m2)
+{
+  return _mm_add_pi32 (__m1, __m2);
+}
+
+/* Add the 64-bit values in M1 to the 64-bit values in M2.  */
+#ifndef __SSE2__
+#pragma GCC push_options
+#ifdef __MMX_WITH_SSE__
+#pragma GCC target("sse2")
+#else
+#pragma GCC target("sse2,mmx")
+#endif
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_si64 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2);
+}
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddsb (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pi8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddsw (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pi16 (__m1, __m2);
+}
+
+/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddusb (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pu8 (__m1, __m2);
+}
+
+/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
+   saturated arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_adds_pu16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_paddusw (__m64 __m1, __m64 __m2)
+{
+  return _mm_adds_pu16 (__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubb (__m64 __m1, __m64 __m2)
+{
+  return _mm_sub_pi8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubw (__m64 __m1, __m64 __m2)
+{
+  return _mm_sub_pi16 (__m1, __m2);
+}
+
+/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubd (__m64 __m1, __m64 __m2)
+{
+  return _mm_sub_pi32 (__m1, __m2);
+}
+
+/* Subtract the 64-bit value in M2 from the 64-bit value in M1.  */
+#ifndef __SSE2__
+#pragma GCC push_options
+#ifdef __MMX_WITH_SSE__
+#pragma GCC target("sse2")
+#else
+#pragma GCC target("sse2,mmx")
+#endif
+#define __DISABLE_SSE2__
+#endif /* __SSE2__ */
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_si64 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2);
+}
+#ifdef __DISABLE_SSE2__
+#undef __DISABLE_SSE2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE2__ */
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
+   saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubsb (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pi8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+   signed saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubsw (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pi16 (__m1, __m2);
+}
+
+/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
+   unsigned saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pu8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubusb (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pu8 (__m1, __m2);
+}
+
+/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
+   unsigned saturating arithmetic.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_subs_pu16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psubusw (__m64 __m1, __m64 __m2)
+{
+  return _mm_subs_pu16 (__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
+   four 32-bit intermediate results, which are then summed by pairs to
+   produce two 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaddwd (__m64 __m1, __m64 __m2)
+{
+  return _mm_madd_pi16 (__m1, __m2);
+}
+
+/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
+   M2 and produce the high 16 bits of the 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhw (__m64 __m1, __m64 __m2)
+{
+  return _mm_mulhi_pi16 (__m1, __m2);
+}
+
+/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
+   the low 16 bits of the results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmullw (__m64 __m1, __m64 __m2)
+{
+  return _mm_mullo_pi16 (__m1, __m2);
+}
+
+/* Shift four 16-bit values in M left by COUNT.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_pi16 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllw (__m64 __m, __m64 __count)
+{
+  return _mm_sll_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi16 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllwi (__m64 __m, int __count)
+{
+  return _mm_slli_pi16 (__m, __count);
+}
+
+/* Shift two 32-bit values in M left by COUNT.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_pi32 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pslld (__m64 __m, __m64 __count)
+{
+  return _mm_sll_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_pi32 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pslldi (__m64 __m, int __count)
+{
+  return _mm_slli_pi32 (__m, __count);
+}
+
+/* Shift the 64-bit value in M left by COUNT.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sll_si64 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllq (__m64 __m, __m64 __count)
+{
+  return _mm_sll_si64 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_slli_si64 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psllqi (__m64 __m, int __count)
+{
+  return _mm_slli_si64 (__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_pi16 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psraw (__m64 __m, __m64 __count)
+{
+  return _mm_sra_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_pi16 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrawi (__m64 __m, int __count)
+{
+  return _mm_srai_pi16 (__m, __count);
+}
+
+/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sra_pi32 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrad (__m64 __m, __m64 __count)
+{
+  return _mm_sra_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srai_pi32 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psradi (__m64 __m, int __count)
+{
+  return _mm_srai_pi32 (__m, __count);
+}
+
+/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_pi16 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlw (__m64 __m, __m64 __count)
+{
+  return _mm_srl_pi16 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi16 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlwi (__m64 __m, int __count)
+{
+  return _mm_srli_pi16 (__m, __count);
+}
+
+/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_pi32 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrld (__m64 __m, __m64 __count)
+{
+  return _mm_srl_pi32 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_pi32 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrldi (__m64 __m, int __count)
+{
+  return _mm_srli_pi32 (__m, __count);
+}
+
+/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srl_si64 (__m64 __m, __m64 __count)
+{
+  return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlq (__m64 __m, __m64 __count)
+{
+  return _mm_srl_si64 (__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_srli_si64 (__m64 __m, int __count)
+{
+  return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psrlqi (__m64 __m, int __count)
+{
+  return _mm_srli_si64 (__m, __count);
+}
+
+/* Bit-wise AND the 64-bit values in M1 and M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_pand (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pand (__m64 __m1, __m64 __m2)
+{
+  return _mm_and_si64 (__m1, __m2);
+}
+
+/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
+   64-bit value in M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_pandn (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pandn (__m64 __m1, __m64 __m2)
+{
+  return _mm_andnot_si64 (__m1, __m2);
+}
+
+/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_por (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_por (__m64 __m1, __m64 __m2)
+{
+  return _mm_or_si64 (__m1, __m2);
+}
+
+/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_si64 (__m64 __m1, __m64 __m2)
+{
+  return __builtin_ia32_pxor (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pxor (__m64 __m1, __m64 __m2)
+{
+  return _mm_xor_si64 (__m1, __m2);
+}
+
+/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
+   test is true and zero if false.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqb (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpeq_pi8 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtb (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpgt_pi8 (__m1, __m2);
+}
+
+/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
+   the test is true and zero if false.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqw (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpeq_pi16 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtw (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpgt_pi16 (__m1, __m2);
+}
+
+/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
+   the test is true and zero if false.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpeqd (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpeq_pi32 (__m1, __m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2)
+{
+  return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pcmpgtd (__m64 __m1, __m64 __m2)
+{
+  return _mm_cmpgt_pi32 (__m1, __m2);
+}
+
+/* Creates a 64-bit zero.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_si64 (void)
+{
+  return (__m64)0LL;
+}
+
+/* Creates a vector of two 32-bit values; I0 is least significant.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi32 (int __i1, int __i0)
+{
+  return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1);
+}
+
+/* Creates a vector of four 16-bit values; W0 is least significant.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0)
+{
+  return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3);
+}
+
+/* Creates a vector of eight 8-bit values; B0 is least significant.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4,
+            char __b3, char __b2, char __b1, char __b0)
+{
+  return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3,
+                                              __b4, __b5, __b6, __b7);
+}
+
+/* Similar, but with the arguments in reverse order.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi32 (int __i0, int __i1)
+{
+  return _mm_set_pi32 (__i1, __i0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3)
+{
+  return _mm_set_pi16 (__w3, __w2, __w1, __w0);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3,
+             char __b4, char __b5, char __b6, char __b7)
+{
+  return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
+}
+
+/* Creates a vector of two 32-bit values, both elements containing I.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi32 (int __i)
+{
+  return _mm_set_pi32 (__i, __i);
+}
+
+/* Creates a vector of four 16-bit values, all elements containing W.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi16 (short __w)
+{
+  return _mm_set_pi16 (__w, __w, __w, __w);
+}
+
+/* Creates a vector of eight 8-bit values, all elements containing B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_pi8 (char __b)
+{
+  return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b);
+}
+#ifdef __DISABLE_MMX__
+#undef __DISABLE_MMX__
+#pragma GCC pop_options
+#endif /* __DISABLE_MMX__ */
+
+#endif /* _MMINTRIN_H_INCLUDED */
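A small example of the MMX intrinsics above, showing saturating arithmetic and the mandatory _mm_empty() before any subsequent x87 floating-point code; the constants are arbitrary:

#include <mmintrin.h>           /* build with -mmmx (implied on most x86-64 configurations) */

/* Saturating unsigned byte add: every lane of 200 + 100 clamps to 255. */
static int mmx_saturate_demo (void)
{
  __m64 a = _mm_set1_pi8 ((char) 200);
  __m64 b = _mm_set1_pi8 ((char) 100);
  int low4 = _mm_cvtsi64_si32 (_mm_adds_pu8 (a, b));  /* low four lanes: 0xFFFFFFFF */
  _mm_empty ();                 /* reset the FP/MMX state */
  return low4;
}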
diff --git a/include-gcc/movdirintrin.h b/include-gcc/movdirintrin.h
new file mode 100644 (file)
index 0000000..92b500e
--- /dev/null
@@ -0,0 +1,74 @@
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <movdirintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _MOVDIRINTRIN_H_INCLUDED
+#define _MOVDIRINTRIN_H_INCLUDED
+
+#ifndef __MOVDIRI__
+#pragma GCC push_options
+#pragma GCC target ("movdiri")
+#define __DISABLE_MOVDIRI__
+#endif /* __MOVDIRI__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_directstoreu_u32 (void * __P, unsigned int __A)
+{
+  __builtin_ia32_directstoreu_u32 ((unsigned int *)__P, __A);
+}
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_directstoreu_u64 (void * __P, unsigned long long __A)
+{
+  __builtin_ia32_directstoreu_u64 ((unsigned long long *)__P, __A);
+}
+#endif
+
+#ifdef __DISABLE_MOVDIRI__
+#undef __DISABLE_MOVDIRI__
+#pragma GCC pop_options
+#endif /* __DISABLE_MOVDIRI__ */
+
+#ifndef __MOVDIR64B__
+#pragma GCC push_options
+#pragma GCC target ("movdir64b")
+#define __DISABLE_MOVDIR64B__
+#endif /* __MOVDIR64B__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_movdir64b (void * __P, const void * __Q)
+{
+  __builtin_ia32_movdir64b (__P, __Q);
+}
+
+#ifdef __DISABLE_MOVDIR64B__
+#undef __DISABLE_MOVDIR64B__
+#pragma GCC pop_options
+#endif /* __DISABLE_MOVDIR64B__ */
+#endif /* _MOVDIRINTRIN_H_INCLUDED.  */
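A hedged sketch of the direct-store intrinsics above, as they are typically used to post work to a memory-mapped device portal; the portal and doorbell addresses are hypothetical, and the 64-byte-aligned-destination requirement for MOVDIR64B is taken from the ISA documentation:

#include <x86gprintrin.h>       /* build with -mmovdiri -mmovdir64b */
#include <stdint.h>

/* Copy a 64-byte descriptor to a device portal, then ring a 32-bit doorbell. */
static void post_descriptor (void *portal, const void *desc, volatile uint32_t *doorbell)
{
  _movdir64b (portal, desc);                 /* destination must be 64-byte aligned */
  _directstoreu_u32 ((void *) doorbell, 1);  /* direct (write-combining) store */
}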
diff --git a/include-gcc/mwaitintrin.h b/include-gcc/mwaitintrin.h
new file mode 100644 (file)
index 0000000..9ade96b
--- /dev/null
@@ -0,0 +1,52 @@
+/* Copyright (C) 2021-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _MWAITINTRIN_H_INCLUDED
+#define _MWAITINTRIN_H_INCLUDED
+
+#ifndef __MWAIT__
+#pragma GCC push_options
+#pragma GCC target("mwait")
+#define __DISABLE_MWAIT__
+#endif /* __MWAIT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_monitor (void const * __P, unsigned int __E, unsigned int __H)
+{
+  __builtin_ia32_monitor (__P, __E, __H);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mwait (unsigned int __E, unsigned int __H)
+{
+  __builtin_ia32_mwait (__E, __H);
+}
+
+#ifdef __DISABLE_MWAIT__
+#undef __DISABLE_MWAIT__
+#pragma GCC pop_options
+#endif /* __DISABLE_MWAIT__ */
+
+#endif /* _MWAITINTRIN_H_INCLUDED */
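The MONITOR/MWAIT pair above is normally used for idle loops; both instructions execute at CPL 0, which is the natural setting for a unikernel. A minimal sketch (the flag name and zero hints are illustrative):

#include <mwaitintrin.h>        /* build with -mmwait (or -msse3) */

/* Sleep until *flag is written by another CPU or an interrupt arrives. */
static void wait_for_flag (volatile int *flag)
{
  while (!*flag)
    {
      _mm_monitor ((const void *) flag, 0, 0);  /* arm the monitor on the flag's cache line */
      if (!*flag)                               /* re-check to close the race */
        _mm_mwait (0, 0);
    }
}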
diff --git a/include-gcc/mwaitxintrin.h b/include-gcc/mwaitxintrin.h
new file mode 100644 (file)
index 0000000..4dc1c9c
--- /dev/null
@@ -0,0 +1,50 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _MWAITXINTRIN_H_INCLUDED
+#define _MWAITXINTRIN_H_INCLUDED
+
+#ifndef __MWAITX__
+#pragma GCC push_options
+#pragma GCC target("mwaitx")
+#define __DISABLE_MWAITX__
+#endif /* __MWAITX__ */
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_monitorx (void const * __P, unsigned int __E, unsigned int __H)
+{
+  __builtin_ia32_monitorx (__P, __E, __H);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mwaitx (unsigned int __E, unsigned int __H, unsigned int __C)
+{
+  __builtin_ia32_mwaitx (__E, __H, __C);
+}
+
+#ifdef __DISABLE_MWAITX__
+#undef __DISABLE_MWAITX__
+#pragma GCC pop_options
+#endif /* __DISABLE_MWAITX__ */
+
+#endif /* _MWAITXINTRIN_H_INCLUDED */
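MONITORX/MWAITX are the AMD counterparts with an optional timeout. The operand mapping assumed below is that the first argument of _mm_mwaitx carries the extension bits (bit 1 enabling the timer) and the third the TSC-cycle budget, per AMD's documentation:

#include <mwaitxintrin.h>       /* build with -mmwaitx */

/* As in the MWAIT sketch above, but give up after roughly 'budget' TSC cycles. */
static void wait_for_flag_bounded (volatile int *flag, unsigned int budget)
{
  while (!*flag)
    {
      _mm_monitorx ((const void *) flag, 0, 0);
      if (!*flag)
        _mm_mwaitx (0x2, 0, budget);   /* 0x2 = timer-enable bit (assumed) */
    }
}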
diff --git a/include-gcc/pconfigintrin.h b/include-gcc/pconfigintrin.h
new file mode 100644 (file)
index 0000000..bd8252a
--- /dev/null
@@ -0,0 +1,78 @@
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <pconfigintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _PCONFIGINTRIN_H_INCLUDED
+#define _PCONFIGINTRIN_H_INCLUDED
+
+#ifndef __PCONFIG__
+#pragma GCC push_options
+#pragma GCC target("pconfig")
+#define __DISABLE_PCONFIG__
+#endif /* __PCONFIG__ */
+
+#define __pconfig_b(leaf, b, retval)                   \
+  __asm__ __volatile__ ("pconfig\n\t"                  \
+       : "=a" (retval)                                 \
+       : "a" (leaf), "b" (b)                           \
+       : "cc")
+
+#define __pconfig_generic(leaf, b, c, d, retval)       \
+  __asm__ __volatile__ ("pconfig\n\t"                  \
+       : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)   \
+       : "a" (leaf), "b" (b), "c" (c), "d" (d)         \
+       : "cc")
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pconfig_u32 (const unsigned int __L, size_t __D[])
+{
+  enum __pconfig_type
+  {
+    __PCONFIG_KEY_PROGRAM = 0x01,
+  };
+
+  unsigned int __R = 0;
+
+  if (!__builtin_constant_p (__L))
+    __pconfig_generic (__L, __D[0], __D[1], __D[2], __R);
+  else switch (__L)
+    {
+    case __PCONFIG_KEY_PROGRAM:
+      __pconfig_b (__L, __D[0], __R);
+      break;
+    default:
+      __pconfig_generic (__L, __D[0], __D[1], __D[2], __R);
+    }
+  return __R;
+}
+
+#ifdef __DISABLE_PCONFIG__
+#undef __DISABLE_PCONFIG__
+#pragma GCC pop_options
+#endif /* __DISABLE_PCONFIG__ */
+
+#endif /* _PCONFIGINTRIN_H_INCLUDED */
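Usage note (editorial, not part of this commit): a sketch of invoking the PCONFIG KEY_PROGRAM leaf through _pconfig_u32, assuming -mpconfig and MKTME-capable hardware; the key-programming structure pointer is a placeholder.

#include <x86gprintrin.h>
#include <stddef.h>

static unsigned int mktme_program_key(void *key_program)
{
    size_t data[3] = { (size_t)key_program, 0, 0 };  /* rbx = address of the key-program structure */
    return _pconfig_u32(0x01, data);                 /* 0 indicates success */
}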
diff --git a/include-gcc/pkuintrin.h b/include-gcc/pkuintrin.h
new file mode 100644 (file)
index 0000000..257b5b8
--- /dev/null
@@ -0,0 +1,56 @@
+/* Copyright (C) 2015-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <pkuintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _PKUINTRIN_H_INCLUDED
+#define _PKUINTRIN_H_INCLUDED
+
+#ifndef __PKU__
+#pragma GCC push_options
+#pragma GCC target("pku")
+#define __DISABLE_PKU__
+#endif /* __PKU__ */
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdpkru_u32 (void)
+{
+  return __builtin_ia32_rdpkru ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wrpkru (unsigned int __key)
+{
+  __builtin_ia32_wrpkru (__key);
+}
+
+#ifdef __DISABLE_PKU__
+#undef __DISABLE_PKU__
+#pragma GCC pop_options
+#endif /* __DISABLE_PKU__ */
+
+#endif /* _PKUINTRIN_H_INCLUDED */
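Usage note (editorial, not part of this commit): a sketch of reading and updating the PKRU register with the intrinsics above, assuming -mpku; protection key 1 is an arbitrary example.

#include <x86gprintrin.h>

static void deny_access_to_pkey1(void)
{
    unsigned int pkru = _rdpkru_u32();
    pkru |= 0x3u << (2 * 1);   /* set the access-disable and write-disable bits of pkey 1 */
    _wrpkru(pkru);
}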
diff --git a/include-gcc/pmmintrin.h b/include-gcc/pmmintrin.h
new file mode 100644 (file)
index 0000000..b2674ec
--- /dev/null
@@ -0,0 +1,121 @@
+/* Copyright (C) 2003-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _PMMINTRIN_H_INCLUDED
+#define _PMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE2 and SSE header files.  */
+#include <emmintrin.h>
+#include <mwaitintrin.h>
+
+#ifndef __SSE3__
+#pragma GCC push_options
+#pragma GCC target("sse3")
+#define __DISABLE_SSE3__
+#endif /* __SSE3__ */
+
+/* Additional bits in the MXCSR.  */
+#define _MM_DENORMALS_ZERO_MASK                0x0040
+#define _MM_DENORMALS_ZERO_ON          0x0040
+#define _MM_DENORMALS_ZERO_OFF         0x0000
+
+#define _MM_SET_DENORMALS_ZERO_MODE(mode) \
+  _mm_setcsr ((_mm_getcsr () & ~_MM_DENORMALS_ZERO_MASK) | (mode))
+#define _MM_GET_DENORMALS_ZERO_MODE() \
+  (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK)
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_ps (__m128 __X, __m128 __Y)
+{
+  return (__m128) __builtin_ia32_addsubps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_ps (__m128 __X, __m128 __Y)
+{
+  return (__m128) __builtin_ia32_haddps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_ps (__m128 __X, __m128 __Y)
+{
+  return (__m128) __builtin_ia32_hsubps ((__v4sf)__X, (__v4sf)__Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehdup_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_movshdup ((__v4sf)__X);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_moveldup_ps (__m128 __X)
+{
+  return (__m128) __builtin_ia32_movsldup ((__v4sf)__X);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loaddup_pd (double const *__P)
+{
+  return _mm_load1_pd (__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movedup_pd (__m128d __X)
+{
+  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lddqu_si128 (__m128i const *__P)
+{
+  return (__m128i) __builtin_ia32_lddqu ((char const *)__P);
+}
+
+#ifdef __DISABLE_SSE3__
+#undef __DISABLE_SSE3__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE3__ */
+
+#endif /* _PMMINTRIN_H_INCLUDED */
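Usage note (editorial, not part of this commit): a sketch of the SSE3 horizontal adds declared above, reducing the four lanes of a __m128 to their sum, assuming -msse3.

#include <pmmintrin.h>

static float sum_lanes(__m128 v)
{
    __m128 t = _mm_hadd_ps(v, v);   /* {v0+v1, v2+v3, v0+v1, v2+v3} */
    t = _mm_hadd_ps(t, t);          /* every lane now holds the total */
    return _mm_cvtss_f32(t);
}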
diff --git a/include-gcc/popcntintrin.h b/include-gcc/popcntintrin.h
new file mode 100644 (file)
index 0000000..b039d5f
--- /dev/null
@@ -0,0 +1,53 @@
+/* Copyright (C) 2009-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _POPCNTINTRIN_H_INCLUDED
+#define _POPCNTINTRIN_H_INCLUDED
+
+#ifndef __POPCNT__
+#pragma GCC push_options
+#pragma GCC target("popcnt")
+#define __DISABLE_POPCNT__
+#endif /* __POPCNT__ */
+
+/* Calculate the number of bits set to 1.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_u32 (unsigned int __X)
+{
+  return __builtin_popcount (__X);
+}
+
+#ifdef __x86_64__
+extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_popcnt_u64 (unsigned long long __X)
+{
+  return __builtin_popcountll (__X);
+}
+#endif
+
+#ifdef __DISABLE_POPCNT__
+#undef __DISABLE_POPCNT__
+#pragma GCC pop_options
+#endif  /* __DISABLE_POPCNT__ */
+
+#endif /* _POPCNTINTRIN_H_INCLUDED */
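Usage note (editorial, not part of this commit): a sketch of the POPCNT intrinsics, assuming -mpopcnt; the 64-bit variant is only available on x86-64 targets, as guarded above.

#include <popcntintrin.h>

static int total_set_bits(unsigned int lo, unsigned long long hi)
{
    int n = _mm_popcnt_u32(lo);
#ifdef __x86_64__
    n += (int)_mm_popcnt_u64(hi);
#endif
    return n;
}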
diff --git a/include-gcc/prfchiintrin.h b/include-gcc/prfchiintrin.h
new file mode 100644 (file)
index 0000000..382fc07
--- /dev/null
@@ -0,0 +1,61 @@
+/* Copyright (C) 2022-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _X86GPRINTRIN_H_INCLUDED
+# error "Never use <prfchiintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _PRFCHIINTRIN_H_INCLUDED
+#define _PRFCHIINTRIN_H_INCLUDED
+
+#ifdef __x86_64__
+
+
+#ifndef __PREFETCHI__
+#pragma GCC push_options
+#pragma GCC target("prefetchi")
+#define __DISABLE_PREFETCHI__
+#endif /* __PREFETCHI__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchit0 (void* __P)
+{
+  __builtin_ia32_prefetchi (__P, 3);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchit1 (void* __P)
+{
+  __builtin_ia32_prefetchi (__P, 2);
+}
+
+#ifdef __DISABLE_PREFETCHI__
+#undef __DISABLE_PREFETCHI__
+#pragma GCC pop_options
+#endif /* __DISABLE_PREFETCHI__ */
+
+#endif /* __x86_64__ */
+
+#endif /* _PRFCHIINTRIN_H_INCLUDED */
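Usage note (editorial, not part of this commit): a sketch of the code-prefetch intrinsics, assuming a 64-bit build with -mprefetchi; hot_kernel is a placeholder for code that is about to be executed.

#include <x86gprintrin.h>

static void hot_kernel(void) { /* placeholder hot code path */ }

static void warm_icache_then_run(void)
{
    _m_prefetchit0((void *)hot_kernel);   /* T0 hint for the instruction cache */
    hot_kernel();
}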
diff --git a/include-gcc/prfchwintrin.h b/include-gcc/prfchwintrin.h
new file mode 100644 (file)
index 0000000..f652997
--- /dev/null
@@ -0,0 +1,37 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if !defined _IMMINTRIN_H_INCLUDED && !defined _MM3DNOW_H_INCLUDED
+# error "Never use <prfchwintrin.h> directly; include <immintrin.h> or <mm3dnow.h> instead."
+#endif
+
+#ifndef _PRFCHWINTRIN_H_INCLUDED
+#define _PRFCHWINTRIN_H_INCLUDED
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_prefetchw (void *__P)
+{
+  __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */);
+}
+
+#endif /* _PRFCHWINTRIN_H_INCLUDED */
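Usage note (editorial, not part of this commit): a sketch of _m_prefetchw hinting write intent ahead of a store; struct node is illustrative.

#include <immintrin.h>

struct node { struct node *next; long value; };

static void bump_value(struct node *n)
{
    _m_prefetchw(n);   /* fetch the cache line in a writable state */
    n->value += 1;
}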
diff --git a/include-gcc/raointintrin.h b/include-gcc/raointintrin.h
new file mode 100644 (file)
index 0000000..ad9fbaf
--- /dev/null
@@ -0,0 +1,100 @@
+/* Copyright (C) 2019-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
+#endif // _X86GPRINTRIN_H_INCLUDED
+
+#ifndef __RAOINTINTRIN_H_INCLUDED
+#define __RAOINTINTRIN_H_INCLUDED
+
+#ifndef __RAOINT__
+#pragma GCC push_options
+#pragma GCC target("raoint")
+#define __DISABLE_RAOINT__
+#endif /* __RAOINT__ */
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aadd_i32 (int *__A, int __B)
+{
+  __builtin_ia32_aadd32 ((int *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aand_i32 (int *__A, int __B)
+{
+  __builtin_ia32_aand32 ((int *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aor_i32 (int *__A, int __B)
+{
+  __builtin_ia32_aor32 ((int *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_axor_i32 (int *__A, int __B)
+{
+  __builtin_ia32_axor32 ((int *)__A, __B);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aadd_i64 (long long *__A, long long __B)
+{
+  __builtin_ia32_aadd64 ((long long *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aand_i64 (long long *__A, long long __B)
+{
+  __builtin_ia32_aand64 ((long long *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_aor_i64 (long long *__A, long long __B)
+{
+  __builtin_ia32_aor64 ((long long *)__A, __B);
+}
+
+extern __inline void
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_axor_i64 (long long *__A, long long __B)
+{
+  __builtin_ia32_axor64 ((long long *)__A, __B);
+}
+#endif /* __x86_64__ */
+
+#ifdef __DISABLE_RAOINT__
+#undef __DISABLE_RAOINT__
+#pragma GCC pop_options
+#endif /* __DISABLE_RAOINT__ */
+
+#endif /* __RAOINTINTRIN_H_INCLUDED */
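Usage note (editorial, not part of this commit): a sketch of a fire-and-forget atomic update with the RAO-INT add intrinsic, assuming -mraoint; the counter is illustrative and, as the void return types above show, no result is read back.

#include <x86gprintrin.h>

static int counter;

static void bump_counter(void)
{
    _aadd_i32(&counter, 1);   /* remote atomic add; nothing is returned */
}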
diff --git a/include-gcc/rdseedintrin.h b/include-gcc/rdseedintrin.h
new file mode 100644 (file)
index 0000000..50f45bd
--- /dev/null
@@ -0,0 +1,66 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <rdseedintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _RDSEEDINTRIN_H_INCLUDED
+#define _RDSEEDINTRIN_H_INCLUDED
+
+#ifndef __RDSEED__
+#pragma GCC push_options
+#pragma GCC target("rdseed")
+#define __DISABLE_RDSEED__
+#endif /* __RDSEED__ */
+
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed16_step (unsigned short *__p)
+{
+  return __builtin_ia32_rdseed_hi_step (__p);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed32_step (unsigned int *__p)
+{
+  return __builtin_ia32_rdseed_si_step (__p);
+}
+
+#ifdef __x86_64__
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdseed64_step (unsigned long long *__p)
+{
+  return __builtin_ia32_rdseed_di_step (__p);
+}
+#endif
+
+#ifdef __DISABLE_RDSEED__
+#undef __DISABLE_RDSEED__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDSEED__ */
+
+#endif /* _RDSEEDINTRIN_H_INCLUDED */
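Usage note (editorial, not part of this commit): a sketch of the RDSEED step intrinsics, assuming -mrdseed; the instruction may transiently fail, so the return value must be checked and the call retried.

#include <x86gprintrin.h>

static unsigned int read_seed32(void)
{
    unsigned int seed;
    while (!_rdseed32_step(&seed))
        ;   /* no entropy available yet; retry */
    return seed;
}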
diff --git a/include-gcc/rtmintrin.h b/include-gcc/rtmintrin.h
new file mode 100644 (file)
index 0000000..f722f5b
--- /dev/null
@@ -0,0 +1,84 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <rtmintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _RTMINTRIN_H_INCLUDED
+#define _RTMINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+#define _XBEGIN_STARTED                (~0u)
+#define _XABORT_EXPLICIT       (1 << 0)
+#define _XABORT_RETRY          (1 << 1)
+#define _XABORT_CONFLICT       (1 << 2)
+#define _XABORT_CAPACITY       (1 << 3)
+#define _XABORT_DEBUG          (1 << 4)
+#define _XABORT_NESTED         (1 << 5)
+#define _XABORT_CODE(x)                (((x) >> 24) & 0xFF)
+
+/* Start an RTM code region.  Return _XBEGIN_STARTED on success and the
+   abort condition otherwise.  */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xbegin (void)
+{
+  return __builtin_ia32_xbegin ();
+}
+
+/* Specify the end of an RTM code region.  If it corresponds to the
+   outermost transaction, then attempts the transaction commit.  If the
+   commit fails, then control is transferred to the outermost transaction
+   fallback handler.  */
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xend (void)
+{
+  __builtin_ia32_xend ();
+}
+
+/* Force an RTM abort condition. The control is transferred to the
+   outermost transaction fallback handler with the abort condition IMM.  */
+#ifdef __OPTIMIZE__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xabort (const unsigned int __imm)
+{
+  __builtin_ia32_xabort (__imm);
+}
+#else
+#define _xabort(N)  __builtin_ia32_xabort (N)
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _RTMINTRIN_H_INCLUDED */
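Usage note (editorial, not part of this commit): a sketch of an RTM transaction with a spinlock fallback, assuming -mrtm; the shared counter and fallback lock are illustrative.

#include <immintrin.h>

static long shared;
static volatile int fallback_lock;

static void increment_shared(void)
{
    if (_xbegin() == _XBEGIN_STARTED) {
        shared++;
        _xend();                                   /* commit the transaction */
    } else {                                       /* aborted or RTM unusable */
        while (__sync_lock_test_and_set(&fallback_lock, 1))
            ;                                      /* spin until the lock is free */
        shared++;
        __sync_lock_release(&fallback_lock);
    }
}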
diff --git a/include-gcc/serializeintrin.h b/include-gcc/serializeintrin.h
new file mode 100644 (file)
index 0000000..d5da003
--- /dev/null
@@ -0,0 +1,49 @@
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <serializeintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _SERIALIZE_H_INCLUDED
+#define _SERIALIZE_H_INCLUDED
+
+#ifndef __SERIALIZE__
+#pragma GCC push_options
+#pragma GCC target("serialize")
+#define __DISABLE_SERIALIZE__
+#endif /* __SERIALIZE__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_serialize (void)
+{
+  __builtin_ia32_serialize ();
+}
+
+#ifdef __DISABLE_SERIALIZE__
+#undef __DISABLE_SERIALIZE__
+#pragma GCC pop_options
+#endif /* __DISABLE_SERIALIZE__ */
+
+#endif /* _SERIALIZE_H_INCLUDED.  */
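Usage note (editorial, not part of this commit): a sketch of _serialize fencing the instruction stream before a timestamp read, assuming -mserialize; __rdtsc is available through the same umbrella header.

#include <x86gprintrin.h>

static unsigned long long serialized_timestamp(void)
{
    _serialize();      /* retire everything issued so far */
    return __rdtsc();
}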
diff --git a/include-gcc/sgxintrin.h b/include-gcc/sgxintrin.h
new file mode 100644 (file)
index 0000000..e12fa16
--- /dev/null
@@ -0,0 +1,253 @@
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SGXINTRIN_H_INCLUDED
+#define _SGXINTRIN_H_INCLUDED
+
+#ifndef __SGX__
+#pragma GCC push_options
+#pragma GCC target("sgx")
+#define __DISABLE_SGX__
+#endif /* __SGX__ */
+
+#define __encls_bc(leaf, b, c, retval)                 \
+  __asm__ __volatile__ ("encls\n\t"                    \
+          : "=a" (retval)                              \
+          : "a" (leaf), "b" (b), "c" (c)               \
+          : "cc")
+
+#define __encls_bcd(leaf, b, c, d, retval)             \
+  __asm__ __volatile__("encls\n\t"                     \
+          : "=a" (retval)                              \
+          : "a" (leaf), "b" (b), "c" (c), "d" (d)      \
+          : "cc")
+
+#define __encls_c(leaf, c, retval)                     \
+  __asm__ __volatile__("encls\n\t"                     \
+          : "=a" (retval)                              \
+          : "a" (leaf), "c" (c)                        \
+          : "cc")
+
+#define __encls_edbgrd(leaf, b, c, retval)             \
+  __asm__ __volatile__("encls\n\t"                     \
+          : "=a" (retval), "=b" (b)                    \
+          : "a" (leaf), "c" (c))
+
+#define __encls_generic(leaf, b, c, d, retval)         \
+  __asm__ __volatile__("encls\n\t"                     \
+          : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\
+          : "a" (leaf), "b" (b), "c" (c), "d" (d)      \
+          : "cc")
+
+#define __enclu_bc(leaf, b, c, retval)                 \
+  __asm__ __volatile__("enclu\n\t"                     \
+          : "=a" (retval)                              \
+          : "a" (leaf), "b" (b), "c" (c)               \
+          : "cc")
+
+#define __enclu_bcd(leaf, b, c, d, retval)             \
+  __asm__ __volatile__("enclu\n\t"                     \
+          : "=a" (retval)                              \
+          : "a" (leaf), "b" (b), "c" (c), "d" (d)      \
+          : "cc")
+
+#define __enclu_eenter(leaf, b, c, retval)             \
+  __asm__  __volatile__("enclu\n\t"                    \
+          : "=a" (retval), "=c" (c)                    \
+          : "a" (leaf), "b" (b), "c" (c)               \
+          : "cc")
+
+#define __enclu_eexit(leaf, b, c, retval)              \
+  __asm__  __volatile__("enclu\n\t"                    \
+          : "=a" (retval), "=c" (c)                    \
+          : "a" (leaf), "b" (b)                        \
+          : "cc")
+
+#define __enclu_generic(leaf, b, c, d, retval)         \
+  __asm__ __volatile__("enclu\n\t"                     \
+          : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\
+          : "a" (leaf), "b" (b), "c" (c), "d" (d)      \
+          : "cc")
+
+#define __enclv_bc(leaf, b, c, retval)                 \
+  __asm__ __volatile__("enclv\n\t"                     \
+          : "=a" (retval)                              \
+          : "a" (leaf), "b" (b), "c" (c)               \
+          : "cc")
+
+#define __enclv_cd(leaf, c, d, retval)                 \
+  __asm__ __volatile__("enclv\n\t"                     \
+          : "=a" (retval)                              \
+          : "a" (leaf), "c" (c), "d" (d)               \
+          : "cc")
+
+#define __enclv_generic(leaf, b, c, d, retval)         \
+  __asm__ __volatile__("enclv\n\t"                     \
+          : "=a" (retval), "=b" (b), "=c" (b), "=d" (d)\
+          : "a" (leaf), "b" (b), "c" (c), "d" (d)      \
+          : "cc")
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_encls_u32 (const unsigned int __L, size_t __D[])
+{
+  enum __encls_type
+  {
+    __SGX_ECREATE = 0x00,
+    __SGX_EADD    = 0x01,
+    __SGX_EINIT   = 0x02,
+    __SGX_EREMOVE = 0x03,
+    __SGX_EDBGRD  = 0x04,
+    __SGX_EDBGWR  = 0x05,
+    __SGX_EEXTEND = 0x06,
+    __SGX_ELDB    = 0x07,
+    __SGX_ELDU    = 0x08,
+    __SGX_EBLOCK  = 0x09,
+    __SGX_EPA     = 0x0A,
+    __SGX_EWB     = 0x0B,
+    __SGX_ETRACK  = 0x0C,
+    __SGX_EAUG    = 0x0D,
+    __SGX_EMODPR  = 0x0E,
+    __SGX_EMODT   = 0x0F,
+    __SGX_ERDINFO = 0x10,
+    __SGX_ETRACKC = 0x11,
+    __SGX_ELDBC   = 0x12,
+    __SGX_ELDUC   = 0x13
+  };
+  enum __encls_type __T = (enum __encls_type)__L;
+  unsigned int __R = 0;
+  if (!__builtin_constant_p (__T))
+    __encls_generic (__L, __D[0], __D[1], __D[2], __R);
+  else switch (__T)
+    {
+    case __SGX_ECREATE:
+    case __SGX_EADD:
+    case __SGX_EDBGWR:
+    case __SGX_EEXTEND:
+    case __SGX_EPA:
+    case __SGX_EMODPR:
+    case __SGX_EMODT:
+    case __SGX_EAUG:
+    case __SGX_ERDINFO:
+      __encls_bc (__L, __D[0], __D[1], __R);
+      break;
+    case __SGX_EINIT:
+    case __SGX_ELDB:
+    case __SGX_ELDU:
+    case __SGX_EWB:
+    case __SGX_ELDBC:
+    case __SGX_ELDUC:
+      __encls_bcd (__L, __D[0], __D[1], __D[2], __R);
+      break;
+    case __SGX_EREMOVE:
+    case __SGX_EBLOCK:
+    case __SGX_ETRACK:
+    case __SGX_ETRACKC:
+      __encls_c (__L, __D[1], __R);
+      break;
+    case __SGX_EDBGRD:
+      __encls_edbgrd (__L, __D[0], __D[1], __R);
+      break;
+    default:
+      __encls_generic (__L, __D[0], __D[1], __D[2], __R);
+    }
+  return __R;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enclu_u32 (const unsigned int __L, size_t __D[])
+{
+  enum __enclu_type
+  {
+    __SGX_EREPORT     = 0x00,
+    __SGX_EGETKEY     = 0x01,
+    __SGX_EENTER      = 0x02,
+    __SGX_ERESUME     = 0x03,
+    __SGX_EEXIT       = 0x04,
+    __SGX_EACCEPT     = 0x05,
+    __SGX_EMODPE      = 0x06,
+    __SGX_EACCEPTCOPY = 0x07
+  };
+  enum __enclu_type __T = (enum __enclu_type) __L;
+  unsigned int __R = 0;
+  if (!__builtin_constant_p (__T))
+    __enclu_generic (__L, __D[0], __D[1], __D[2], __R);
+  else switch (__T)
+    {
+    case __SGX_EREPORT:
+    case __SGX_EACCEPTCOPY:
+      __enclu_bcd (__L, __D[0], __D[1], __D[2], __R);
+      break;
+    case __SGX_EGETKEY:
+    case __SGX_ERESUME:
+    case __SGX_EACCEPT:
+    case __SGX_EMODPE:
+      __enclu_bc (__L, __D[0], __D[1], __R);
+      break;
+    case __SGX_EENTER:
+      __enclu_eenter (__L, __D[0], __D[1], __R);
+      break;
+    case __SGX_EEXIT:
+      __enclu_eexit (__L, __D[0], __D[1], __R);
+      break;
+    default:
+      __enclu_generic (__L, __D[0], __D[1], __D[2], __R);
+    }
+  return __R;
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_enclv_u32 (const unsigned int __L, size_t __D[])
+{
+  enum __enclv_type
+  {
+    __SGX_EDECVIRTCHILD = 0x00,
+    __SGX_EINCVIRTCHILD = 0x01,
+    __SGX_ESETCONTEXT   = 0x02
+  };
+  unsigned int __R = 0;
+  if (!__builtin_constant_p (__L))
+    __enclv_generic (__L, __D[0], __D[1], __D[2], __R);
+  else switch (__L)
+    {
+    case __SGX_EDECVIRTCHILD:
+    case __SGX_EINCVIRTCHILD:
+      __enclv_bc (__L, __D[0], __D[1], __R);
+      break;
+    case __SGX_ESETCONTEXT:
+      __enclv_cd (__L, __D[1], __D[2], __R);
+      break;
+    default:
+      __enclv_generic (__L, __D[0], __D[1], __D[2], __R);
+    }
+  return __R;
+}
+
+#ifdef __DISABLE_SGX__
+#undef __DISABLE_SGX__
+#pragma GCC pop_options
+#endif /* __DISABLE_SGX__ */
+
+#endif /* _SGXINTRIN_H_INCLUDED */
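Usage note (editorial, not part of this commit): a sketch of issuing ENCLU[EENTER] (leaf 2) through _enclu_u32, assuming -msgx; the TCS and AEP pointers are placeholders, and real use requires an enclave prepared by an SGX runtime.

#include <immintrin.h>
#include <stddef.h>

static unsigned int enter_enclave(void *tcs, void *aep)
{
    size_t regs[3] = { (size_t)tcs, (size_t)aep, 0 };  /* rbx = TCS, rcx = AEP */
    return _enclu_u32(0x02, regs);                     /* 0x02 selects EENTER */
}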
diff --git a/include-gcc/shaintrin.h b/include-gcc/shaintrin.h
new file mode 100644 (file)
index 0000000..ea85e31
--- /dev/null
@@ -0,0 +1,98 @@
+/* Copyright (C) 2013-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <shaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _SHAINTRIN_H_INCLUDED
+#define _SHAINTRIN_H_INCLUDED
+
+#ifndef __SHA__
+#pragma GCC push_options
+#pragma GCC target("sha")
+#define __DISABLE_SHA__
+#endif /* __SHA__ */
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1msg1_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha1msg1 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1msg2_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha1msg2 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1nexte_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I)
+{
+  return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I);
+}
+#else
+#define _mm_sha1rnds4_epu32(A, B, I)                               \
+  ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)(A),       \
+                                      (__v4si)(__m128i)(B), (int)(I)))
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256msg1_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha256msg1 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256msg2_epu32 (__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_sha256msg2 ((__v4si) __A, (__v4si) __B);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_sha256rnds2 ((__v4si) __A, (__v4si) __B,
+                                              (__v4si) __C);
+}
+
+#ifdef __DISABLE_SHA__
+#undef __DISABLE_SHA__
+#pragma GCC pop_options
+#endif /* __DISABLE_SHA__ */
+
+#endif /* _SHAINTRIN_H_INCLUDED */
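Usage note (editorial, not part of this commit): a sketch of one SHA-256 message-schedule step built from the SHA-NI helpers above, assuming a build with -mssse3 -msha; the surrounding schedule and round logic is omitted.

#include <immintrin.h>

static __m128i sha256_schedule_step(__m128i w0, __m128i w1, __m128i w2, __m128i w3)
{
    __m128i t = _mm_sha256msg1_epu32(w0, w1);           /* sigma0 contribution */
    t = _mm_add_epi32(t, _mm_alignr_epi8(w3, w2, 4));   /* add the w[t-7] words */
    return _mm_sha256msg2_epu32(t, w3);                 /* sigma1 contribution */
}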
diff --git a/include-gcc/smmintrin.h b/include-gcc/smmintrin.h
new file mode 100644 (file)
index 0000000..1605acb
--- /dev/null
@@ -0,0 +1,852 @@
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 10.0.  */
+
+#ifndef _SMMINTRIN_H_INCLUDED
+#define _SMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header
+   files.  */
+#include <tmmintrin.h>
+
+#ifndef __SSE4_1__
+#pragma GCC push_options
+#pragma GCC target("sse4.1")
+#define __DISABLE_SSE4_1__
+#endif /* __SSE4_1__ */
+
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT      0x00
+#define _MM_FROUND_TO_NEG_INF          0x01
+#define _MM_FROUND_TO_POS_INF          0x02
+#define _MM_FROUND_TO_ZERO             0x03
+#define _MM_FROUND_CUR_DIRECTION       0x04
+
+#define _MM_FROUND_RAISE_EXC           0x00
+#define _MM_FROUND_NO_EXC              0x08
+
+#define _MM_FROUND_NINT                \
+  (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR       \
+  (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL                \
+  (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC       \
+  (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT                \
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT   \
+  (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
+/* Test Instruction */
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & __M) == 0.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & ~__M) == 0.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Packed integer 128-bit bitwise comparison. Return 1 if
+   (__V & __M) != 0 && (__V & ~__M) != 0.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_si128 (__m128i __M, __m128i __V)
+{
+  return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V);
+}
+
+/* Macros for packed integer 128-bit comparison intrinsics.  */
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+#define _mm_test_all_ones(V) \
+  _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
+
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
+
+/* Packed/scalar double precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_pd (__m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_sd(__m128d __D, __m128d __V, const int __M)
+{
+  return (__m128d) __builtin_ia32_roundsd ((__v2df)__D,
+                                          (__v2df)__V,
+                                          __M);
+}
+#else
+#define _mm_round_pd(V, M) \
+  ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M)))
+
+#define _mm_round_sd(D, V, M)                                          \
+  ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D),             \
+                                    (__v2df)(__m128d)(V), (int)(M)))
+#endif
+
+/* Packed/scalar single precision floating point rounding.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ps (__m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ss (__m128 __D, __m128 __V, const int __M)
+{
+  return (__m128) __builtin_ia32_roundss ((__v4sf)__D,
+                                         (__v4sf)__V,
+                                         __M);
+}
+#else
+#define _mm_round_ps(V, M) \
+  ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M)))
+
+#define _mm_round_ss(D, V, M)                                          \
+  ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D),               \
+                                   (__v4sf)(__m128)(V), (int)(M)))
+#endif
+
+/* Macros for ceil/floor intrinsics.  */
+#define _mm_ceil_pd(V)    _mm_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(D, V)  _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_pd(V)           _mm_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
+
+#define _mm_ceil_ps(V)    _mm_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(D, V)  _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(V)           _mm_round_ps ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
+
+/* SSE4.1 */
+
+/* Integer blend instructions - select data from 2 sources using
+   constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X,
+                                             (__v8hi)__Y,
+                                             __M);
+}
+#else
+#define _mm_blend_epi16(X, Y, M)                                       \
+  ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X),          \
+                                       (__v8hi)(__m128i)(Y), (int)(M)))
+#endif
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M)
+{
+  return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X,
+                                              (__v16qi)__Y,
+                                              (__v16qi)__M);
+}
+
+/* Single precision floating point blend instructions - select data
+   from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_ps (__m128 __X, __m128 __Y, const int __M)
+{
+  return (__m128) __builtin_ia32_blendps ((__v4sf)__X,
+                                         (__v4sf)__Y,
+                                         __M);
+}
+#else
+#define _mm_blend_ps(X, Y, M)                                          \
+  ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X),               \
+                                   (__v4sf)(__m128)(Y), (int)(M)))
+#endif
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M)
+{
+  return (__m128) __builtin_ia32_blendvps ((__v4sf)__X,
+                                          (__v4sf)__Y,
+                                          (__v4sf)__M);
+}
+
+/* Double precision floating point blend instructions - select data
+   from 2 sources using constant/variable mask.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_pd (__m128d __X, __m128d __Y, const int __M)
+{
+  return (__m128d) __builtin_ia32_blendpd ((__v2df)__X,
+                                          (__v2df)__Y,
+                                          __M);
+}
+#else
+#define _mm_blend_pd(X, Y, M)                                          \
+  ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X),             \
+                                    (__v2df)(__m128d)(Y), (int)(M)))
+#endif
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M)
+{
+  return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X,
+                                           (__v2df)__Y,
+                                           (__v2df)__M);
+}
+
+/* Dot product instructions with mask-defined summing and zeroing parts
+   of result.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dp_ps (__m128 __X, __m128 __Y, const int __M)
+{
+  return (__m128) __builtin_ia32_dpps ((__v4sf)__X,
+                                      (__v4sf)__Y,
+                                      __M);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dp_pd (__m128d __X, __m128d __Y, const int __M)
+{
+  return (__m128d) __builtin_ia32_dppd ((__v2df)__X,
+                                       (__v2df)__Y,
+                                       __M);
+}
+#else
+#define _mm_dp_ps(X, Y, M)                                             \
+  ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X),                  \
+                                (__v4sf)(__m128)(Y), (int)(M)))
+
+#define _mm_dp_pd(X, Y, M)                                             \
+  ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X),                        \
+                                 (__v2df)(__m128d)(Y), (int)(M)))
+#endif
+
+/* Packed integer 64-bit comparison, zeroing or filling with ones
+   corresponding parts of result.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) ((__v2di)__X == (__v2di)__Y);
+}
+
+/*  Min/max packed integer instructions.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Packed integer 32-bit multiplication with truncation of upper
+   halves of results.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) ((__v4su)__X * (__v4su)__Y);
+}
+
+/* Packed integer 32-bit multiplication of 2 pairs of operands
+   with two 64-bit results.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Insert single precision float into packed single precision array
+   element selected by index N.  The bits [7-6] of N define S
+   index, the bits [5-4] define D index, and bits [3-0] define
+   zeroing mask for D.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_ps (__m128 __D, __m128 __S, const int __N)
+{
+  return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D,
+                                             (__v4sf)__S,
+                                             __N);
+}
+#else
+#define _mm_insert_ps(D, S, N)                                         \
+  ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D),           \
+                                       (__v4sf)(__m128)(S), (int)(N)))
+#endif
+
+/* Helper macro to create the N value for _mm_insert_ps.  */
+#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M))
+
+/* Extract binary representation of single precision float from packed
+   single precision array element of X selected by index N.  */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
+  union { int __i; float __f; } __tmp;
+  __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N);
+  return __tmp.__i;
+}
+#else
+#define _mm_extract_ps(X, N)                                           \
+  (__extension__                                                       \
+   ({                                                                  \
+     union { int __i; float __f; } __tmp;                              \
+     __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X),     \
+                                             (int)(N));                \
+     __tmp.__i;                                                                \
+   }))
+#endif
+
+/* Extract binary representation of single precision float into
+   D from packed single precision array element of S selected
+   by index N.  */
+#define _MM_EXTRACT_FLOAT(D, S, N) \
+  { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); }
+  
+/* Extract specified single precision float element into the lower
+   part of __m128.  */
+#define _MM_PICK_OUT_PS(X, N)                          \
+  _mm_insert_ps (_mm_setzero_ps (), (X),               \
+                _MM_MK_INSERTPS_NDX ((N), 0, 0x0e))
+
+/* Insert integer, S, into packed integer array element of D
+   selected by index N.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi8 (__m128i __D, int __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D,
+                                                __S, __N);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi32 (__m128i __D, int __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D,
+                                                __S, __N);
+}
+
+#ifdef __x86_64__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi64 (__m128i __D, long long __S, const int __N)
+{
+  return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D,
+                                                __S, __N);
+}
+#endif
+#else
+#define _mm_insert_epi8(D, S, N)                                       \
+  ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D),      \
+                                          (int)(S), (int)(N)))
+
+#define _mm_insert_epi32(D, S, N)                              \
+  ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D),        \
+                                         (int)(S), (int)(N)))
+
+#ifdef __x86_64__
+#define _mm_insert_epi64(D, S, N)                                      \
+  ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D),                \
+                                         (long long)(S), (int)(N)))
+#endif
+#endif
+
+/* Extract integer from packed integer array element of X selected by
+   index N.  */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+   return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
+   return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N);
+}
+
+#ifdef __x86_64__
+extern __inline long long  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
+  return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N);
+}
+#endif
+#else
+#define _mm_extract_epi8(X, N) \
+  ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N)))
+#define _mm_extract_epi32(X, N) \
+  ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N)))
+
+#ifdef __x86_64__
+#define _mm_extract_epi64(X, N) \
+  ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N)))
+#endif
+#endif
+
+/* Return horizontal packed word minimum and its index in bits [15:0]
+   and bits [18:16] respectively.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_minpos_epu16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X);
+}
+
+/* Packed integer sign-extension.  */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X);
+}
+
+/* Packed integer zero-extension. */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi64 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X);
+}
+
+/* Pack 8 double words from 2 operands into 8 words of result with
+   unsigned saturation. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y);
+}
+
+/* Sum absolute 8-bit integer difference of adjacent groups of 4
+   byte integers in the first 2 operands.  Starting offsets within
+   operands are determined by the 3rd mask operand.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X,
+                                             (__v16qi)__Y, __M);
+}
+#else
+#define _mm_mpsadbw_epu8(X, Y, M)                                      \
+  ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X),         \
+                                       (__v16qi)(__m128i)(Y), (int)(M)))
+#endif
+
+/* Load double quadword using non-temporal aligned hint.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_load_si128 (__m128i *__X)
+{
+  return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X);
+}
+
+#ifndef __SSE4_2__
+#pragma GCC push_options
+#pragma GCC target("sse4.2")
+#define __DISABLE_SSE4_2__
+#endif /* __SSE4_2__ */
+
+/* These macros specify the source data format.  */
+#define _SIDD_UBYTE_OPS                        0x00
+#define _SIDD_UWORD_OPS                        0x01
+#define _SIDD_SBYTE_OPS                        0x02
+#define _SIDD_SWORD_OPS                        0x03
+
+/* These macros specify the comparison operation.  */
+#define _SIDD_CMP_EQUAL_ANY            0x00
+#define _SIDD_CMP_RANGES               0x04
+#define _SIDD_CMP_EQUAL_EACH           0x08
+#define _SIDD_CMP_EQUAL_ORDERED                0x0c
+
+/* These macros specify the polarity.  */
+#define _SIDD_POSITIVE_POLARITY                0x00
+#define _SIDD_NEGATIVE_POLARITY                0x10
+#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
+#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
+
+/* These macros specify the output selection in _mm_cmpXstri ().  */
+#define _SIDD_LEAST_SIGNIFICANT                0x00
+#define _SIDD_MOST_SIGNIFICANT         0x40
+
+/* These macros specify the output selection in _mm_cmpXstrm ().  */
+#define _SIDD_BIT_MASK                 0x00
+#define _SIDD_UNIT_MASK                        0x40
+
+/* Intrinsics for text/string processing.  */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M)
+{
+  return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X,
+                                               (__v16qi)__Y,
+                                               __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistri (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistri128 ((__v16qi)__X,
+                                     (__v16qi)__Y,
+                                     __M);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX,
+                                               (__v16qi)__Y, __LY,
+                                               __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX,
+                                     (__v16qi)__Y, __LY,
+                                     __M);
+}
+#else
+#define _mm_cmpistrm(X, Y, M)                                          \
+  ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X),       \
+                                         (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistri(X, Y, M)                                          \
+  ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X),           \
+                                     (__v16qi)(__m128i)(Y), (int)(M)))
+
+#define _mm_cmpestrm(X, LX, Y, LY, M)                                  \
+  ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X),       \
+                                         (int)(LX), (__v16qi)(__m128i)(Y), \
+                                         (int)(LY), (int)(M)))
+#define _mm_cmpestri(X, LX, Y, LY, M)                                  \
+  ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX),        \
+                                     (__v16qi)(__m128i)(Y), (int)(LY), \
+                                     (int)(M)))
+#endif
+
+/* Intrinsics for text/string processing and reading values of
+   EFlags.  */
+
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistra (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistria128 ((__v16qi)__X,
+                                      (__v16qi)__Y,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistric128 ((__v16qi)__X,
+                                      (__v16qi)__Y,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistro (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistrio128 ((__v16qi)__X,
+                                      (__v16qi)__Y,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistris128 ((__v16qi)__X,
+                                      (__v16qi)__Y,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M)
+{
+  return __builtin_ia32_pcmpistriz128 ((__v16qi)__X,
+                                      (__v16qi)__Y,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX,
+                                      (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX,
+                                      (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX,
+                                      (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX,
+                                      (__v16qi)__Y, __LY,
+                                      __M);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M)
+{
+  return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX,
+                                      (__v16qi)__Y, __LY,
+                                      __M);
+}
+#else
+#define _mm_cmpistra(X, Y, M)                                          \
+  ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X),          \
+                                      (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrc(X, Y, M)                                          \
+  ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X),          \
+                                      (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistro(X, Y, M)                                          \
+  ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X),          \
+                                      (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrs(X, Y, M)                                          \
+  ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X),          \
+                                      (__v16qi)(__m128i)(Y), (int)(M)))
+#define _mm_cmpistrz(X, Y, M)                                          \
+  ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X),          \
+                                      (__v16qi)(__m128i)(Y), (int)(M)))
+
+#define _mm_cmpestra(X, LX, Y, LY, M)                                  \
+  ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \
+                                      (__v16qi)(__m128i)(Y), (int)(LY), \
+                                      (int)(M)))
+#define _mm_cmpestrc(X, LX, Y, LY, M)                                  \
+  ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \
+                                      (__v16qi)(__m128i)(Y), (int)(LY), \
+                                      (int)(M)))
+#define _mm_cmpestro(X, LX, Y, LY, M)                                  \
+  ((int) __builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \
+                                      (__v16qi)(__m128i)(Y), (int)(LY), \
+                                      (int)(M)))
+#define _mm_cmpestrs(X, LX, Y, LY, M)                                  \
+  ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \
+                                      (__v16qi)(__m128i)(Y), (int)(LY), \
+                                      (int)(M)))
+#define _mm_cmpestrz(X, LX, Y, LY, M)                                  \
+  ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \
+                                      (__v16qi)(__m128i)(Y), (int)(LY), \
+                                      (int)(M)))
+#endif
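/* Illustrative usage sketch for the string intrinsics and _SIDD_* flags
   above (user code would include <smmintrin.h> or <immintrin.h> and build
   with -msse4.2): return the index of the first space or tab in a 16-byte
   chunk, or 16 if none is found.  Both loads read a full 16 bytes, which
   the caller must make safe.  */
static int
first_separator (const char *chunk16)
{
  __m128i set = _mm_setr_epi8 (' ', '\t', 0, 0, 0, 0, 0, 0,
                               0, 0, 0, 0, 0, 0, 0, 0);
  __m128i data = _mm_loadu_si128 ((const __m128i *) chunk16);
  return _mm_cmpistri (set, data,
                       _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY
                       | _SIDD_LEAST_SIGNIFICANT);
}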
+
+/* Packed integer 64-bit comparison, zeroing or filling with ones
+   corresponding parts of result.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) ((__v2di)__X > (__v2di)__Y);
+}
+
+#ifdef __DISABLE_SSE4_2__
+#undef __DISABLE_SSE4_2__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_2__ */
+
+#ifdef __DISABLE_SSE4_1__
+#undef __DISABLE_SSE4_1__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE4_1__ */
+
+#include <popcntintrin.h>
+
+#ifndef __CRC32__
+#pragma GCC push_options
+#pragma GCC target("crc32")
+#define __DISABLE_CRC32__
+#endif /* __CRC32__ */
+
+/* Accumulate CRC32 (polynomial 0x11EDC6F41) value.  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u8 (unsigned int __C, unsigned char __V)
+{
+  return __builtin_ia32_crc32qi (__C, __V);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u16 (unsigned int __C, unsigned short __V)
+{
+  return __builtin_ia32_crc32hi (__C, __V);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u32 (unsigned int __C, unsigned int __V)
+{
+  return __builtin_ia32_crc32si (__C, __V);
+}
+
+#ifdef __x86_64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_crc32_u64 (unsigned long long __C, unsigned long long __V)
+{
+  return __builtin_ia32_crc32di (__C, __V);
+}
+#endif
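/* Illustrative usage sketch for the CRC32 intrinsics above (this is the
   CRC-32C/Castagnoli polynomial, not the zlib CRC-32); built with -mcrc32
   or -msse4.2.  A byte-at-a-time fold over a buffer:  */
static unsigned int
crc32c_bytes (unsigned int crc, const unsigned char *buf, unsigned long len)
{
  while (len--)
    crc = _mm_crc32_u8 (crc, *buf++);
  return crc;
}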
+
+#ifdef __DISABLE_CRC32__
+#undef __DISABLE_CRC32__
+#pragma GCC pop_options
+#endif /* __DISABLE_CRC32__ */
+
+#endif /* _SMMINTRIN_H_INCLUDED */
diff --git a/include-gcc/tbmintrin.h b/include-gcc/tbmintrin.h
new file mode 100644 (file)
index 0000000..9227f9a
--- /dev/null
@@ -0,0 +1,180 @@
+/* Copyright (C) 2010-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <tbmintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _TBMINTRIN_H_INCLUDED
+#define _TBMINTRIN_H_INCLUDED
+
+#ifndef __TBM__
+#pragma GCC push_options
+#pragma GCC target("tbm")
+#define __DISABLE_TBM__
+#endif /* __TBM__ */
+
+#ifdef __OPTIMIZE__
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u32 (unsigned int __X, const unsigned int __I)
+{
+  return __builtin_ia32_bextri_u32 (__X, __I);
+}
+#else
+#define __bextri_u32(X, I)                                             \
+  ((unsigned int)__builtin_ia32_bextri_u32 ((unsigned int)(X),         \
+                                           (unsigned int)(I)))
+#endif /*__OPTIMIZE__ */
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u32 (unsigned int __X)
+{
+  return __X & (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u32 (unsigned int __X)
+{
+  return __X | ~(__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u32 (unsigned int __X)
+{
+  return ~__X & (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u32 (unsigned int __X)
+{
+  return __X ^ (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u32 (unsigned int __X)
+{
+  return __X | (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u32 (unsigned int __X)
+{
+  return __X | (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u32 (unsigned int __X)
+{
+  return ~__X | (__X - 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u32 (unsigned int __X)
+{
+  return ~__X | (__X + 1);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u32 (unsigned int __X)
+{
+  return ~__X & (__X - 1);
+}
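/* Worked example of the bit identities above (illustrative note): for
   __X = 0x58 (binary 0101 1000),
     __blsfill_u32 (0x58) == 0x5f   (0x58 | 0x57: set all trailing zeros)
     __tzmsk_u32   (0x58) == 0x07   (~0x58 & 0x57: mask of the trailing zeros)
     __blcmsk_u32  (0x58) == 0x01   (0x58 ^ 0x59: mask up to the lowest clear bit)
   With -mtbm each compiles to a single BLSFILL/TZMSK/BLCMSK instruction.  */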
+
+
+
+#ifdef __x86_64__
+#ifdef __OPTIMIZE__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextri_u64 (unsigned long long __X, const unsigned int __I)
+{
+  return __builtin_ia32_bextri_u64 (__X, __I);
+}
+#else
+#define __bextri_u64(X, I)                                                \
+  ((unsigned long long)__builtin_ia32_bextri_u64 ((unsigned long long)(X), \
+                                                 (unsigned long long)(I)))
+#endif /*__OPTIMIZE__ */
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcfill_u64 (unsigned long long __X)
+{
+  return __X & (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blci_u64 (unsigned long long __X)
+{
+  return __X | ~(__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcic_u64 (unsigned long long __X)
+{
+  return ~__X & (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcmsk_u64 (unsigned long long __X)
+{
+  return __X ^ (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blcs_u64 (unsigned long long __X)
+{
+  return __X | (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsfill_u64 (unsigned long long __X)
+{
+  return __X | (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsic_u64 (unsigned long long __X)
+{
+  return ~__X | (__X - 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__t1mskc_u64 (unsigned long long __X)
+{
+  return ~__X | (__X + 1);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzmsk_u64 (unsigned long long __X)
+{
+  return ~__X & (__X - 1);
+}
+
+
+#endif /* __x86_64__  */
+
+#ifdef __DISABLE_TBM__
+#undef __DISABLE_TBM__
+#pragma GCC pop_options
+#endif /* __DISABLE_TBM__ */
+
+#endif /* _TBMINTRIN_H_INCLUDED */
diff --git a/include-gcc/tmmintrin.h b/include-gcc/tmmintrin.h
new file mode 100644 (file)
index 0000000..2df29a9
--- /dev/null
@@ -0,0 +1,249 @@
+/* Copyright (C) 2006-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.1.  */
+
+#ifndef _TMMINTRIN_H_INCLUDED
+#define _TMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE3, SSE2 and SSE header files.  */
+#include <pmmintrin.h>
+
+#ifndef __SSSE3__
+#pragma GCC push_options
+#pragma GCC target("ssse3")
+#define __DISABLE_SSSE3__
+#endif /* __SSSE3__ */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadds_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y);
+}
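/* Illustrative sketch of the horizontal-add semantics above: with 16-bit
   lanes __X = {x0..x7} and __Y = {y0..y7}, _mm_hadd_epi16 returns
   {x0+x1, x2+x3, x4+x5, x6+x7, y0+y1, y2+y3, y4+y5, y6+y7}.  Folding one
   vector down to a single (wrapping) sum, assuming SSSE3 and SSE2:  */
static short
sum8_epi16 (__m128i v)
{
  v = _mm_hadd_epi16 (v, v);   /* 8 lanes -> 4 pairwise sums (duplicated) */
  v = _mm_hadd_epi16 (v, v);   /* 4 -> 2 */
  v = _mm_hadd_epi16 (v, v);   /* 2 -> 1, result in lane 0 */
  return (short) _mm_extract_epi16 (v, 0);
}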
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddubs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhrs_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi8 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y);
+}
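/* Illustrative usage sketch: _mm_shuffle_epi8 picks each result byte from
   __X using the low four bits of the corresponding byte of __Y, and writes
   zero where __Y's byte has its high bit set.  Reversing the 16 bytes of a
   vector, for example:  */
static __m128i
reverse_bytes (__m128i v)
{
  const __m128i idx = _mm_setr_epi8 (15, 14, 13, 12, 11, 10, 9, 8,
                                     7, 6, 5, 4, 3, 2, 1, 0);
  return _mm_shuffle_epi8 (v, idx);
}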
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi8 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi16 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_epi32 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi8 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi16 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sign_pi32 (__m64 __X, __m64 __Y)
+{
+  return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y);
+}
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N)
+{
+  return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X,
+                                             (__v2di)__Y, __N * 8);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N)
+{
+  return (__m64) __builtin_ia32_palignr ((__v1di)__X,
+                                        (__v1di)__Y, __N * 8);
+}
+#else
+#define _mm_alignr_epi8(X, Y, N)                                       \
+  ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X),          \
+                                       (__v2di)(__m128i)(Y),           \
+                                       (int)(N) * 8))
+#define _mm_alignr_pi8(X, Y, N)                                                \
+  ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X),                 \
+                                  (__v1di)(__m64)(Y),                  \
+                                  (int)(N) * 8))
+#endif
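/* Illustrative usage sketch: _mm_alignr_epi8 concatenates __X (high) with
   __Y (low) and extracts 16 bytes starting at byte offset __N of __Y, which
   is useful for realigning data already held in two registers.  For
   instance, bytes 5..20 of a 32-byte region:  */
static __m128i
bytes_5_to_20 (__m128i lo /* bytes 0..15 */, __m128i hi /* bytes 16..31 */)
{
  return _mm_alignr_epi8 (hi, lo, 5);
}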
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi8 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi16 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_epi32 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi8 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsb ((__v8qi)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi16 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsw ((__v4hi)__X);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_abs_pi32 (__m64 __X)
+{
+  return (__m64) __builtin_ia32_pabsd ((__v2si)__X);
+}
+
+#ifdef __DISABLE_SSSE3__
+#undef __DISABLE_SSSE3__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSSE3__ */
+
+#endif /* _TMMINTRIN_H_INCLUDED */
diff --git a/include-gcc/tsxldtrkintrin.h b/include-gcc/tsxldtrkintrin.h
new file mode 100644 (file)
index 0000000..c3dce59
--- /dev/null
@@ -0,0 +1,56 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <tsxldtrkintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _TSXLDTRKINTRIN_H_INCLUDED
+#define _TSXLDTRKINTRIN_H_INCLUDED
+
+#if !defined(__TSXLDTRK__)
+#pragma GCC push_options
+#pragma GCC target("tsxldtrk")
+#define __DISABLE_TSXLDTRK__
+#endif /* __TSXLDTRK__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsusldtrk (void)
+{
+  __builtin_ia32_xsusldtrk ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xresldtrk (void)
+{
+  __builtin_ia32_xresldtrk ();
+}
+
+#ifdef __DISABLE_TSXLDTRK__
+#undef __DISABLE_TSXLDTRK__
+#pragma GCC pop_options
+#endif /* __DISABLE_TSXLDTRK__ */
+
+#endif /* _TSXLDTRKINTRIN_H_INCLUDED */
diff --git a/include-gcc/uintrintrin.h b/include-gcc/uintrintrin.h
new file mode 100644 (file)
index 0000000..dc5dab3
--- /dev/null
@@ -0,0 +1,84 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <uintrintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _UINTRNTRIN_H_INCLUDED
+#define _UINTRNTRIN_H_INCLUDED
+
+#ifdef __x86_64__
+
+#ifndef __UINTR__
+#pragma GCC push_options
+#pragma GCC target ("uintr")
+#define __DISABLE_UINTR__
+#endif /* __UINTR__ */
+
+struct __uintr_frame
+{
+  /* RIP of the interrupted user process.  */
+  unsigned long long rip;
+  /* RFLAGS of the interrupted user process.  */
+  unsigned long long rflags;
+  /* RSP of the interrupted user process.  */
+  unsigned long long rsp;
+};
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_clui (void)
+{
+  __builtin_ia32_clui ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_stui (void)
+{
+  __builtin_ia32_stui ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_senduipi (unsigned long long __R)
+{
+  __builtin_ia32_senduipi (__R);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_testui (void)
+{
+  return __builtin_ia32_testui ();
+}
+
+#ifdef __DISABLE_UINTR__
+#undef __DISABLE_UINTR__
+#pragma GCC pop_options
+#endif /* __DISABLE_UINTR__ */
+
+#endif
+
+#endif /* _UINTRNTRIN_H_INCLUDED.  */
diff --git a/include-gcc/vaesintrin.h b/include-gcc/vaesintrin.h
new file mode 100644 (file)
index 0000000..0f1cffe
--- /dev/null
@@ -0,0 +1,111 @@
+/* Copyright (C) 2017-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef __VAESINTRIN_H_INCLUDED
+#define __VAESINTRIN_H_INCLUDED
+
+#if !defined(__VAES__) || !defined(__AVX__)
+#pragma GCC push_options
+#pragma GCC target("vaes,avx")
+#define __DISABLE_VAES__
+#endif /* __VAES__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesdec_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesdec_v32qi ((__v32qi) __A, (__v32qi) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesdeclast_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesdeclast_v32qi ((__v32qi) __A,
+                                                               (__v32qi) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesenc_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesenc_v32qi ((__v32qi) __A, (__v32qi) __B);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_aesenclast_epi128 (__m256i __A, __m256i __B)
+{
+  return (__m256i)__builtin_ia32_vaesenclast_v32qi ((__v32qi) __A,
+                                                               (__v32qi) __B);
+}
+
+#ifdef __DISABLE_VAES__
+#undef __DISABLE_VAES__
+#pragma GCC pop_options
+#endif /* __DISABLE_VAES__ */
+
+
+#if !defined(__VAES__) || !defined(__AVX512F__)
+#pragma GCC push_options
+#pragma GCC target("vaes,avx512f")
+#define __DISABLE_VAESF__
+#endif /* __VAES__ */
+
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesdec_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesdec_v64qi ((__v64qi) __A, (__v64qi) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesdeclast_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesdeclast_v64qi ((__v64qi) __A,
+                                                   (__v64qi) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesenc_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesenc_v64qi ((__v64qi) __A, (__v64qi) __B);
+}
+
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_aesenclast_epi128 (__m512i __A, __m512i __B)
+{
+  return (__m512i)__builtin_ia32_vaesenclast_v64qi ((__v64qi) __A,
+                                                   (__v64qi) __B);
+}
+
+#ifdef __DISABLE_VAESF__
+#undef __DISABLE_VAESF__
+#pragma GCC pop_options
+#endif /* __DISABLE_VAESF__ */
+
+#endif /* __VAESINTRIN_H_INCLUDED */
diff --git a/include-gcc/vpclmulqdqintrin.h b/include-gcc/vpclmulqdqintrin.h
new file mode 100644 (file)
index 0000000..ba93fc4
--- /dev/null
@@ -0,0 +1,81 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <vpclmulqdqintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _VPCLMULQDQINTRIN_H_INCLUDED
+#define _VPCLMULQDQINTRIN_H_INCLUDED
+
+#if !defined(__VPCLMULQDQ__) || !defined(__AVX512F__)
+#pragma GCC push_options
+#pragma GCC target("vpclmulqdq,avx512f")
+#define __DISABLE_VPCLMULQDQF__
+#endif /* __VPCLMULQDQF__ */
+
+#ifdef __OPTIMIZE__
+extern __inline __m512i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_clmulepi64_epi128 (__m512i __A, __m512i __B, const int __C)
+{
+  return (__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)__A,
+                                                 (__v8di) __B, __C);
+}
+#else
+#define _mm512_clmulepi64_epi128(A, B, C)                                 \
+  ((__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)(__m512i)(A),     \
+                               (__v8di)(__m512i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_VPCLMULQDQF__
+#undef __DISABLE_VPCLMULQDQF__
+#pragma GCC pop_options
+#endif /* __DISABLE_VPCLMULQDQF__ */
+
+#if !defined(__VPCLMULQDQ__) || !defined(__AVX__)
+#pragma GCC push_options
+#pragma GCC target("vpclmulqdq,avx")
+#define __DISABLE_VPCLMULQDQ__
+#endif /* __VPCLMULQDQ__ */
+
+#ifdef __OPTIMIZE__
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_clmulepi64_epi128 (__m256i __A, __m256i __B, const int __C)
+{
+  return (__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)__A,
+                                                  (__v4di) __B, __C);
+}
+#else
+#define _mm256_clmulepi64_epi128(A, B, C)                         \
+  ((__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)(__m256i)(A), \
+                               (__v4di)(__m256i)(B), (int)(C)))
+#endif
+
+#ifdef __DISABLE_VPCLMULQDQ__
+#undef __DISABLE_VPCLMULQDQ__
+#pragma GCC pop_options
+#endif /* __DISABLE_VPCLMULQDQ__ */
+
+#endif /* _VPCLMULQDQINTRIN_H_INCLUDED */
diff --git a/include-gcc/waitpkgintrin.h b/include-gcc/waitpkgintrin.h
new file mode 100644 (file)
index 0000000..9d2f23a
--- /dev/null
@@ -0,0 +1,63 @@
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <waitpkgintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _WAITPKG_H_INCLUDED
+#define _WAITPKG_H_INCLUDED
+
+#ifndef __WAITPKG__
+#pragma GCC push_options
+#pragma GCC target("waitpkg")
+#define __DISABLE_WAITPKG__
+#endif /* __WAITPKG__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_umonitor (void *__A)
+{
+  __builtin_ia32_umonitor (__A);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_umwait (unsigned int __A, unsigned long long __B)
+{
+  return __builtin_ia32_umwait (__A, __B);
+}
+
+extern __inline unsigned char
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tpause (unsigned int __A, unsigned long long __B)
+{
+  return __builtin_ia32_tpause (__A, __B);
+}
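/* Illustrative usage sketch: arm address monitoring with _umonitor, then
   wait with _umwait until either the monitored cache line is written or
   the given TSC deadline passes.  Control value 0 requests the deeper
   C0.2 state; the return value is the carry flag from UMWAIT, set when
   the wait was cut short by the OS-imposed time limit.  */
static unsigned char
wait_for_store (void *addr, unsigned long long tsc_deadline)
{
  _umonitor (addr);
  return _umwait (0, tsc_deadline);
}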
+
+#ifdef __DISABLE_WAITPKG__
+#undef __DISABLE_WAITPKG__
+#pragma GCC pop_options
+#endif /* __DISABLE_WAITPKG__ */
+
+#endif /* _WAITPKG_H_INCLUDED.  */
diff --git a/include-gcc/wbnoinvdintrin.h b/include-gcc/wbnoinvdintrin.h
new file mode 100644 (file)
index 0000000..5d1e0ab
--- /dev/null
@@ -0,0 +1,49 @@
+/* Copyright (C) 2018-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <wbnoinvdintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _WBNOINVDINTRIN_H_INCLUDED
+#define _WBNOINVDINTRIN_H_INCLUDED
+
+#ifndef __WBNOINVD__
+#pragma GCC push_options
+#pragma GCC target("wbnoinvd")
+#define __DISABLE_WBNOINVD__
+#endif /* __WBNOINVD__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wbnoinvd (void)
+{
+  __builtin_ia32_wbnoinvd ();
+}
+
+#ifdef __DISABLE_WBNOINVD__
+#undef __DISABLE_WBNOINVD__
+#pragma GCC pop_options
+#endif /* __DISABLE_WBNOINVD__ */
+
+#endif /* _WBNOINVDINTRIN_H_INCLUDED */
diff --git a/include-gcc/wmmintrin.h b/include-gcc/wmmintrin.h
new file mode 100644 (file)
index 0000000..ae15cea
--- /dev/null
@@ -0,0 +1,132 @@
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 10.1.  */
+
+#ifndef _WMMINTRIN_H_INCLUDED
+#define _WMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE2 header file.  */
+#include <emmintrin.h>
+
+/* AES */
+
+#if !defined(__AES__) || !defined(__SSE2__)
+#pragma GCC push_options
+#pragma GCC target("aes,sse2")
+#define __DISABLE_AES__
+#endif /* __AES__ */
+
+/* Performs 1 round of AES decryption of the first m128i using 
+   the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdec_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the last round of AES decryption of the first m128i 
+   using the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesdeclast_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X,
+                                                (__v2di)__Y);
+}
+
+/* Performs 1 round of AES encryption of the first m128i using 
+   the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenc_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the last round of AES encryption of the first m128i
+   using the second m128i as a round key.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesenclast_si128 (__m128i __X, __m128i __Y)
+{
+  return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y);
+}
+
+/* Performs the InverseMixColumns operation on the source m128i
+   and stores the result into the m128i destination.  */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aesimc_si128 (__m128i __X)
+{
+  return (__m128i) __builtin_ia32_aesimc128 ((__v2di)__X);
+}
+
+/* Generates an m128i round key for the input m128i AES cipher key and
+   byte round constant.  The second parameter must be a compile time
+   constant.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_aeskeygenassist_si128 (__m128i __X, const int __C)
+{
+  return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C);
+}
+#else
+#define _mm_aeskeygenassist_si128(X, C)                                        \
+  ((__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)(__m128i)(X),  \
+                                               (int)(C)))
+#endif
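/* Illustrative usage sketch: encrypting one block with AES-128 given
   pre-expanded round keys rk[0..10] (key expansion, which combines
   _mm_aeskeygenassist_si128 with shuffles and XORs, is not shown).  */
static __m128i
aes128_encrypt_block (__m128i block, const __m128i rk[11])
{
  int i;
  block = _mm_xor_si128 (block, rk[0]);         /* initial AddRoundKey */
  for (i = 1; i < 10; ++i)
    block = _mm_aesenc_si128 (block, rk[i]);    /* rounds 1..9 */
  return _mm_aesenclast_si128 (block, rk[10]);  /* final round */
}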
+
+#ifdef __DISABLE_AES__
+#undef __DISABLE_AES__
+#pragma GCC pop_options
+#endif /* __DISABLE_AES__ */
+
+/* PCLMUL */
+
+#if !defined(__PCLMUL__) || !defined(__SSE2__)
+#pragma GCC push_options
+#pragma GCC target("pclmul,sse2")
+#define __DISABLE_PCLMUL__
+#endif /* __PCLMUL__ */
+
+/* Performs carry-less integer multiplication of 64-bit halves of
+   128-bit input operands.  The third parameter indicates which 64-bit
+   halves of the input parameters __X and __Y should be used.  It must be
+   a compile time constant.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_clmulepi64_si128 (__m128i __X, __m128i __Y, const int __I)
+{
+  return (__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)__X,
+                                               (__v2di)__Y, __I);
+}
+#else
+#define _mm_clmulepi64_si128(X, Y, I)                                  \
+  ((__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)(__m128i)(X),                \
+                                         (__v2di)(__m128i)(Y), (int)(I)))
+#endif
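/* Illustrative usage sketch: bit 0 of the immediate selects the half of
   __X, bit 4 the half of __Y.  A full 128x128 -> 256-bit carry-less
   product can therefore be assembled from four calls (schoolbook):  */
static void
clmul_128_full (__m128i a, __m128i b, __m128i *lo, __m128i *hi)
{
  __m128i t0 = _mm_clmulepi64_si128 (a, b, 0x00);   /* a.lo * b.lo */
  __m128i t1 = _mm_clmulepi64_si128 (a, b, 0x10);   /* a.lo * b.hi */
  __m128i t2 = _mm_clmulepi64_si128 (a, b, 0x01);   /* a.hi * b.lo */
  __m128i t3 = _mm_clmulepi64_si128 (a, b, 0x11);   /* a.hi * b.hi */
  __m128i mid = _mm_xor_si128 (t1, t2);
  *lo = _mm_xor_si128 (t0, _mm_slli_si128 (mid, 8));
  *hi = _mm_xor_si128 (t3, _mm_srli_si128 (mid, 8));
}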
+
+#ifdef __DISABLE_PCLMUL__
+#undef __DISABLE_PCLMUL__
+#pragma GCC pop_options
+#endif /* __DISABLE_PCLMUL__ */
+
+#endif /* _WMMINTRIN_H_INCLUDED */
diff --git a/include-gcc/x86gprintrin.h b/include-gcc/x86gprintrin.h
new file mode 100644 (file)
index 0000000..f41be3f
--- /dev/null
@@ -0,0 +1,275 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+#define _X86GPRINTRIN_H_INCLUDED
+
+#if !defined _SOFT_FLOAT || defined __MMX__ || defined __SSE__
+#pragma GCC push_options
+#pragma GCC target("general-regs-only")
+#define __DISABLE_GENERAL_REGS_ONLY__
+#endif
+
+#include <ia32intrin.h>
+
+#ifndef __iamcu__
+
+#include <stddef.h>
+
+#include <adxintrin.h>
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#include <cetintrin.h>
+
+#include <cldemoteintrin.h>
+
+#include <clflushoptintrin.h>
+
+#include <clwbintrin.h>
+
+#include <clzerointrin.h>
+
+#include <cmpccxaddintrin.h>
+
+#include <enqcmdintrin.h>
+
+#include <fxsrintrin.h>
+
+#include <lzcntintrin.h>
+
+#include <lwpintrin.h>
+
+#include <movdirintrin.h>
+
+#include <mwaitintrin.h>
+
+#include <mwaitxintrin.h>
+
+#include <pconfigintrin.h>
+
+#include <popcntintrin.h>
+
+#include <pkuintrin.h>
+
+#include <prfchiintrin.h>
+
+#include <raointintrin.h>
+
+#include <rdseedintrin.h>
+
+#include <rtmintrin.h>
+
+#include <serializeintrin.h>
+
+#include <sgxintrin.h>
+
+#include <tbmintrin.h>
+
+#include <tsxldtrkintrin.h>
+
+#include <uintrintrin.h>
+
+#include <waitpkgintrin.h>
+
+#include <wbnoinvdintrin.h>
+
+#include <xsaveintrin.h>
+
+#include <xsavecintrin.h>
+
+#include <xsaveoptintrin.h>
+
+#include <xsavesintrin.h>
+
+#include <xtestintrin.h>
+
+#include <hresetintrin.h>
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_wbinvd (void)
+{
+  __builtin_ia32_wbinvd ();
+}
+
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif /* __RDRND__ */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand16_step (unsigned short *__P)
+{
+  return __builtin_ia32_rdrand16_step (__P);
+}
+
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand32_step (unsigned int *__P)
+{
+  return __builtin_ia32_rdrand32_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDRND__ */
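/* Illustrative usage sketch: _rdrand32_step stores a random value through
   its argument and returns 1 on success, 0 when no entropy was available,
   so callers normally retry a bounded number of times.  */
static int
rdrand32_retry (unsigned int *value, int tries)
{
  while (tries-- > 0)
    if (_rdrand32_step (value))
      return 1;
  return 0;
}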
+
+#ifndef __RDPID__
+#pragma GCC push_options
+#pragma GCC target("rdpid")
+#define __DISABLE_RDPID__
+#endif /* __RDPID__ */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdpid_u32 (void)
+{
+  return __builtin_ia32_rdpid ();
+}
+#ifdef __DISABLE_RDPID__
+#undef __DISABLE_RDPID__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDPID__ */
+
+#ifdef  __x86_64__
+
+#ifndef __FSGSBASE__
+#pragma GCC push_options
+#pragma GCC target("fsgsbase")
+#define __DISABLE_FSGSBASE__
+#endif /* __FSGSBASE__ */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u32 (void)
+{
+  return __builtin_ia32_rdfsbase32 ();
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readfsbase_u64 (void)
+{
+  return __builtin_ia32_rdfsbase64 ();
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u32 (void)
+{
+  return __builtin_ia32_rdgsbase32 ();
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_readgsbase_u64 (void)
+{
+  return __builtin_ia32_rdgsbase64 ();
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u32 (unsigned int __B)
+{
+  __builtin_ia32_wrfsbase32 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writefsbase_u64 (unsigned long long __B)
+{
+  __builtin_ia32_wrfsbase64 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u32 (unsigned int __B)
+{
+  __builtin_ia32_wrgsbase32 (__B);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_writegsbase_u64 (unsigned long long __B)
+{
+  __builtin_ia32_wrgsbase64 (__B);
+}
+#ifdef __DISABLE_FSGSBASE__
+#undef __DISABLE_FSGSBASE__
+#pragma GCC pop_options
+#endif /* __DISABLE_FSGSBASE__ */
+
+#ifndef __RDRND__
+#pragma GCC push_options
+#pragma GCC target("rdrnd")
+#define __DISABLE_RDRND__
+#endif /* __RDRND__ */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_rdrand64_step (unsigned long long *__P)
+{
+  return __builtin_ia32_rdrand64_step (__P);
+}
+#ifdef __DISABLE_RDRND__
+#undef __DISABLE_RDRND__
+#pragma GCC pop_options
+#endif /* __DISABLE_RDRND__ */
+
+#endif /* __x86_64__  */
+
+#ifndef __PTWRITE__
+#pragma GCC push_options
+#pragma GCC target("ptwrite")
+#define __DISABLE_PTWRITE__
+#endif
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_ptwrite64 (unsigned long long __B)
+{
+  __builtin_ia32_ptwrite64 (__B);
+}
+#endif /* __x86_64__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_ptwrite32 (unsigned __B)
+{
+  __builtin_ia32_ptwrite32 (__B);
+}
+#ifdef __DISABLE_PTWRITE__
+#undef __DISABLE_PTWRITE__
+#pragma GCC pop_options
+#endif /* __DISABLE_PTWRITE__ */
+
+#endif /* __iamcu__ */
+
+#ifdef __DISABLE_GENERAL_REGS_ONLY__
+#undef __DISABLE_GENERAL_REGS_ONLY__
+#pragma GCC pop_options
+#endif /* __DISABLE_GENERAL_REGS_ONLY__ */
+
+#endif /* _X86GPRINTRIN_H_INCLUDED.  */
diff --git a/include-gcc/x86intrin.h b/include-gcc/x86intrin.h
new file mode 100644 (file)
index 0000000..ac612ce
--- /dev/null
@@ -0,0 +1,42 @@
+/* Copyright (C) 2008-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+#define _X86INTRIN_H_INCLUDED
+
+#include <x86gprintrin.h>
+
+#ifndef __iamcu__
+
+/* For including AVX instructions */
+#include <immintrin.h>
+
+#include <mm3dnow.h>
+
+#include <fma4intrin.h>
+
+#include <xopintrin.h>
+
+#endif /* __iamcu__ */
+
+#endif /* _X86INTRIN_H_INCLUDED */
diff --git a/include-gcc/xmmintrin.h b/include-gcc/xmmintrin.h
new file mode 100644 (file)
index 0000000..cb518fc
--- /dev/null
@@ -0,0 +1,1340 @@
+/* Copyright (C) 2002-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef _XMMINTRIN_H_INCLUDED
+#define _XMMINTRIN_H_INCLUDED
+
+/* We need type definitions from the MMX header file.  */
+#include <mmintrin.h>
+
+/* Get _mm_malloc () and _mm_free ().  */
+#include <mm_malloc.h>
+
+/* Constants for use with _mm_prefetch.  */
+enum _mm_hint
+{
+  _MM_HINT_IT0 = 19,
+  _MM_HINT_IT1 = 18,
+  /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set.  */
+  _MM_HINT_ET0 = 7,
+  _MM_HINT_ET1 = 6,
+  _MM_HINT_T0 = 3,
+  _MM_HINT_T1 = 2,
+  _MM_HINT_T2 = 1,
+  _MM_HINT_NTA = 0
+};
+
+/* Loads one cache line from address P to a location "closer" to the
+   processor.  The selector I specifies the type of prefetch operation.  */
+#ifdef __OPTIMIZE__
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_prefetch (const void *__P, enum _mm_hint __I)
+{
+  __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2,
+                          __I & 0x3, (__I & 0x10) >> 4);
+}
+#else
+#define _mm_prefetch(P, I) \
+  __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4)
+#endif
+
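
A small, hypothetical sketch of how the prefetch wrapper above is typically used; the helper name and the look-ahead distance of 16 elements are illustrative only:

    #include <xmmintrin.h>

    float sum_with_prefetch(const float *a, int n)
    {
      float s = 0.0f;
      for (int i = 0; i < n; i++) {
        if (i + 16 < n)
          _mm_prefetch(a + i + 16, _MM_HINT_T0);  /* the hint must be a compile-time constant */
        s += a[i];
      }
      return s;
    }
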
+#ifndef __SSE__
+#pragma GCC push_options
+#pragma GCC target("sse")
+#define __DISABLE_SSE__
+#endif /* __SSE__ */
+
+/* The Intel API is flexible enough that we must allow aliasing with other
+   vector types, and their scalar components.  */
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+
+/* Unaligned version of the same type.  */
+typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
+
+/* Internal data types for implementing the intrinsics.  */
+typedef float __v4sf __attribute__ ((__vector_size__ (16)));
+
+/* Create a selector for use with the SHUFPS instruction.  */
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
+ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+
+/* Bits in the MXCSR.  */
+#define _MM_EXCEPT_MASK       0x003f
+#define _MM_EXCEPT_INVALID    0x0001
+#define _MM_EXCEPT_DENORM     0x0002
+#define _MM_EXCEPT_DIV_ZERO   0x0004
+#define _MM_EXCEPT_OVERFLOW   0x0008
+#define _MM_EXCEPT_UNDERFLOW  0x0010
+#define _MM_EXCEPT_INEXACT    0x0020
+
+#define _MM_MASK_MASK         0x1f80
+#define _MM_MASK_INVALID      0x0080
+#define _MM_MASK_DENORM       0x0100
+#define _MM_MASK_DIV_ZERO     0x0200
+#define _MM_MASK_OVERFLOW     0x0400
+#define _MM_MASK_UNDERFLOW    0x0800
+#define _MM_MASK_INEXACT      0x1000
+
+#define _MM_ROUND_MASK        0x6000
+#define _MM_ROUND_NEAREST     0x0000
+#define _MM_ROUND_DOWN        0x2000
+#define _MM_ROUND_UP          0x4000
+#define _MM_ROUND_TOWARD_ZERO 0x6000
+
+#define _MM_FLUSH_ZERO_MASK   0x8000
+#define _MM_FLUSH_ZERO_ON     0x8000
+#define _MM_FLUSH_ZERO_OFF    0x0000
+
+/* Create an undefined vector.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_undefined_ps (void)
+{
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Winit-self"
+  __m128 __Y = __Y;
+#pragma GCC diagnostic pop
+  return __Y;
+}
+
+/* Create a vector of zeros.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setzero_ps (void)
+{
+  return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
+}
+
+/* Perform the respective operation on the lower SPFP (single-precision
+   floating-point) values of A and B; the upper three SPFP values are
+   passed through from A.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ss (__m128 __A)
+{
+  return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ss (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcpss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ss (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform the respective operation on the four SPFP values in A and B.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_add_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A + (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sub_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A - (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A * (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_div_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) ((__v4sf)__A / (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sqrt_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rcp_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rcpps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rsqrt_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B);
+}
+
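
As a hedged illustration of the packed arithmetic above (the helper name is invented), an element-wise multiply-add over four floats:

    #include <xmmintrin.h>

    __m128 fmadd4(__m128 a, __m128 b, __m128 c)
    {
      return _mm_add_ps(_mm_mul_ps(a, b), c);   /* a*b + c in each of the four lanes */
    }
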
+/* Perform logical bit-wise operations on 128-bit values.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_and_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_andps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_andnot_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_andnps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_or_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_orps (__A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_xor_ps (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_xorps (__A, __B);
+}
+
+/* Perform a comparison on the lower SPFP values of A and B.  If the
+   comparison is true, place a mask of all ones in the result, otherwise a
+   mask of zeros.  The upper three SPFP values are passed through from A.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+                                       (__v4sf)
+                                       __builtin_ia32_cmpltss ((__v4sf) __B,
+                                                               (__v4sf)
+                                                               __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+                                       (__v4sf)
+                                       __builtin_ia32_cmpless ((__v4sf) __B,
+                                                               (__v4sf)
+                                                               __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+                                       (__v4sf)
+                                       __builtin_ia32_cmpnltss ((__v4sf) __B,
+                                                                (__v4sf)
+                                                                __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf) __A,
+                                       (__v4sf)
+                                       __builtin_ia32_cmpnless ((__v4sf) __B,
+                                                                (__v4sf)
+                                                                __A));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Perform a comparison on the four SPFP values of A and B.  For each
+   element, if the comparison is true, place a mask of all ones in the
+   result, otherwise a mask of zeros.  */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmplt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmple_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpge_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpneq_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnlt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnle_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpngt_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpnge_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpord_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpunord_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B);
+}
+
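
The all-ones/all-zeros masks produced by the packed comparisons combine with the bit-wise operations above for branchless selection; a hypothetical sketch:

    #include <xmmintrin.h>

    __m128 select_greater(__m128 a, __m128 b)
    {
      __m128 mask = _mm_cmpgt_ps(a, b);              /* 0xFFFFFFFF where a > b, else 0 */
      return _mm_or_ps(_mm_and_ps(mask, a),          /* keep a where the mask is set   */
                       _mm_andnot_ps(mask, b));      /* keep b elsewhere               */
    }
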
+/* Compare the lower SPFP values of A and B and return 1 if true
+   and 0 if false.  */
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comieq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comilt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comile_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comigt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comige_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comineq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomieq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomilt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomile_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomigt_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomige_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_ucomineq_ss (__m128 __A, __m128 __B)
+{
+  return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Convert the lower SPFP value to a 32-bit integer according to the current
+   rounding mode.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si32 (__m128 __A)
+{
+  return __builtin_ia32_cvtss2si ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ss2si (__m128 __A)
+{
+  return _mm_cvtss_si32 (__A);
+}
+
+#ifdef __x86_64__
+/* Convert the lower SPFP value to a 64-bit integer according to the
+   current rounding mode.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64 (__m128 __A)
+{
+  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_si64x (__m128 __A)
+{
+  return __builtin_ia32_cvtss2si64 ((__v4sf) __A);
+}
+#endif
+
+/* Convert the two lower SPFP values to 32-bit integers according to the
+   current rounding mode.  Return the integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi32 (__m128 __A)
+{
+  return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_ps2pi (__m128 __A)
+{
+  return _mm_cvtps_pi32 (__A);
+}
+
+/* Truncate the lower SPFP value to a 32-bit integer.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si32 (__m128 __A)
+{
+  return __builtin_ia32_cvttss2si ((__v4sf) __A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ss2si (__m128 __A)
+{
+  return _mm_cvttss_si32 (__A);
+}
+
+#ifdef __x86_64__
+/* Truncate the lower SPFP value to a 64-bit integer.  */
+
+/* Intel intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64 (__m128 __A)
+{
+  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttss_si64x (__m128 __A)
+{
+  return __builtin_ia32_cvttss2si64 ((__v4sf) __A);
+}
+#endif
+
+/* Truncate the two lower SPFP values to 32-bit integers.  Return the
+   integers in packed form.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvttps_pi32 (__m128 __A)
+{
+  return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtt_ps2pi (__m128 __A)
+{
+  return _mm_cvttps_pi32 (__A);
+}
+
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi32_ss (__m128 __A, int __B)
+{
+  return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_si2ss (__m128 __A, int __B)
+{
+  return _mm_cvtsi32_ss (__A, __B);
+}
+
+#ifdef __x86_64__
+/* Convert B to a SPFP value and insert it as element zero in A.  */
+
+/* Intel intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64_ss (__m128 __A, long long __B)
+{
+  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+
+/* Microsoft intrinsic.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtsi64x_ss (__m128 __A, long long __B)
+{
+  return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B);
+}
+#endif
+
+/* Convert the two 32-bit values in B to SPFP form and insert them
+   as the two lower elements in A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32_ps (__m128 __A, __m64 __B)
+{
+  return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvt_pi2ps (__m128 __A, __m64 __B)
+{
+  return _mm_cvtpi32_ps (__A, __B);
+}
+
+/* Convert the four signed 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi16_ps (__m64 __A)
+{
+  __v4hi __sign;
+  __v2si __hisi, __losi;
+  __v4sf __zero, __ra, __rb;
+
+  /* This comparison against zero gives us a mask that can be used to
+     fill in the missing sign bits in the unpack operations below, so
+     that we get signed values after unpacking.  */
+  __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A);
+
+  /* Convert the four words to doublewords.  */
+  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign);
+  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign);
+
+  /* Convert the doublewords to floating point two at a time.  */
+  __zero = (__v4sf) _mm_setzero_ps ();
+  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
+  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
+
+  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
+}
+
+/* Convert the four unsigned 16-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu16_ps (__m64 __A)
+{
+  __v2si __hisi, __losi;
+  __v4sf __zero, __ra, __rb;
+
+  /* Convert the four words to doublewords.  */
+  __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL);
+  __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL);
+
+  /* Convert the doublewords to floating point two at a time.  */
+  __zero = (__v4sf) _mm_setzero_ps ();
+  __ra = __builtin_ia32_cvtpi2ps (__zero, __losi);
+  __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi);
+
+  return (__m128) __builtin_ia32_movlhps (__ra, __rb);
+}
+
+/* Convert the low four signed 8-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi8_ps (__m64 __A)
+{
+  __v8qi __sign;
+
+  /* This comparison against zero gives us a mask that can be used to
+     fill in the missing sign bits in the unpack operations below, so
+     that we get signed values after unpacking.  */
+  __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A);
+
+  /* Convert the four low bytes to words.  */
+  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign);
+
+  return _mm_cvtpi16_ps(__A);
+}
+
+/* Convert the low four unsigned 8-bit values in A to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpu8_ps(__m64 __A)
+{
+  __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL);
+  return _mm_cvtpu16_ps(__A);
+}
+
+/* Convert the four signed 32-bit values in A and B to SPFP form.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtpi32x2_ps(__m64 __A, __m64 __B)
+{
+  __v4sf __zero = (__v4sf) _mm_setzero_ps ();
+  __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A);
+  __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B);
+  return (__m128) __builtin_ia32_movlhps (__sfa, __sfb);
+}
+
+/* Convert the four SPFP values in A to four signed 16-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi16(__m128 __A)
+{
+  __v4sf __hisf = (__v4sf)__A;
+  __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf);
+  __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf);
+  __v2si __losi = __builtin_ia32_cvtps2pi (__losf);
+  return (__m64) __builtin_ia32_packssdw (__hisi, __losi);
+}
+
+/* Convert the four SPFP values in A to four signed 8-bit integers.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtps_pi8(__m128 __A)
+{
+  __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A);
+  return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL);
+}
+
+/* Selects four specific SPFP values from A and B based on MASK.  */
+#ifdef __OPTIMIZE__
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
+{
+  return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask);
+}
+#else
+#define _mm_shuffle_ps(A, B, MASK)                                     \
+  ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A),                        \
+                                  (__v4sf)(__m128)(B), (int)(MASK)))
+#endif
+
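
A brief, hypothetical example of the _MM_SHUFFLE selector macro together with _mm_shuffle_ps, broadcasting element 2 of a vector:

    #include <xmmintrin.h>

    __m128 broadcast_elem2(__m128 v)
    {
      return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
    }
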
+/* Selects and interleaves the upper two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpackhi_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Selects and interleaves the lower two SPFP values from A and B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_unpacklo_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Sets the upper two SPFP values with 64 bits of data loaded from P;
+   the lower two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadh_pi (__m128 __A, __m64 const *__P)
+{
+  return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P);
+}
+
+/* Stores the upper two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeh_pi (__m64 *__P, __m128 __A)
+{
+  __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A);
+}
+
+/* Moves the upper two values of B into the lower two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehl_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Moves the lower two values of B into the upper two values of A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movelh_ps (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B);
+}
+
+/* Sets the lower two SPFP values with 64 bits of data loaded from P;
+   the upper two values are passed through from A.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadl_pi (__m128 __A, __m64 const *__P)
+{
+  return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P);
+}
+
+/* Stores the lower two SPFP values of A into P.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storel_pi (__m64 *__P, __m128 __A)
+{
+  __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A);
+}
+
+/* Creates a 4-bit mask from the most significant bits of the SPFP values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_ps (__m128 __A)
+{
+  return __builtin_ia32_movmskps ((__v4sf)__A);
+}
+
+/* Return the contents of the control register.  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_getcsr (void)
+{
+  return __builtin_ia32_stmxcsr ();
+}
+
+/* Read exception bits from the control register.  */
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_EXCEPTION_STATE (void)
+{
+  return _mm_getcsr() & _MM_EXCEPT_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_EXCEPTION_MASK (void)
+{
+  return _mm_getcsr() & _MM_MASK_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_ROUNDING_MODE (void)
+{
+  return _mm_getcsr() & _MM_ROUND_MASK;
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_GET_FLUSH_ZERO_MODE (void)
+{
+  return _mm_getcsr() & _MM_FLUSH_ZERO_MASK;
+}
+
+/* Set the control register to I.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setcsr (unsigned int __I)
+{
+  __builtin_ia32_ldmxcsr (__I);
+}
+
+/* Set exception bits in the control register.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_EXCEPTION_STATE(unsigned int __mask)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_EXCEPTION_MASK (unsigned int __mask)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_ROUNDING_MODE (unsigned int __mode)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
+{
+  _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode);
+}
+
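
A sketch (hypothetical helper) of changing the SSE control state with the accessors above, keeping the old MXCSR value so the caller can restore it later with _mm_setcsr:

    #include <xmmintrin.h>

    unsigned int enable_truncate_and_ftz(void)
    {
      unsigned int old = _mm_getcsr();
      _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
      return old;
    }
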
+/* Create a vector with element 0 as F and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ss (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
+}
+
+/* Create a vector with all four elements equal to F.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set1_ps (float __F)
+{
+  return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps1 (float __F)
+{
+  return _mm_set1_ps (__F);
+}
+
+/* Create a vector with element 0 as *P and the rest zero.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ss (float const *__P)
+{
+  return _mm_set_ss (*__P);
+}
+
+/* Create a vector with all four elements equal to *P.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load1_ps (float const *__P)
+{
+  return _mm_set1_ps (*__P);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps1 (float const *__P)
+{
+  return _mm_load1_ps (__P);
+}
+
+/* Load four SPFP values from P.  The address must be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_load_ps (float const *__P)
+{
+  return *(__m128 *)__P;
+}
+
+/* Load four SPFP values from P.  The address need not be 16-byte aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadu_ps (float const *__P)
+{
+  return *(__m128_u *)__P;
+}
+
+/* Load four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loadr_ps (float const *__P)
+{
+  __v4sf __tmp = *(__v4sf *)__P;
+  return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3));
+}
+
+/* Create the vector [Z Y X W].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
+}
+
+/* Create the vector [W X Y Z].  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_setr_ps (float __Z, float __Y, float __X, float __W)
+{
+  return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
+}
+
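
The argument order of _mm_set_ps is highest element first, while _mm_setr_ps takes memory order; a hypothetical sketch in which both calls build the same vector {1, 2, 3, 4}:

    #include <xmmintrin.h>

    __m128 make_1234(void)
    {
      __m128 a = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);  /* memory layout {1, 2, 3, 4} */
      __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);  /* identical layout            */
      (void) b;
      return a;
    }
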
+/* Stores the lower SPFP value.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ss (float *__P, __m128 __A)
+{
+  *__P = ((__v4sf)__A)[0];
+}
+
+extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtss_f32 (__m128 __A)
+{
+  return ((__v4sf)__A)[0];
+}
+
+/* Store four SPFP values.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps (float *__P, __m128 __A)
+{
+  *(__m128 *)__P = __A;
+}
+
+/* Store four SPFP values.  The address need not be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storeu_ps (float *__P, __m128 __A)
+{
+  *(__m128_u *)__P = __A;
+}
+
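
An illustrative (made-up) helper contrasting the unaligned forms with the aligned ones above: _mm_loadu_ps/_mm_storeu_ps place no alignment requirement on the pointers, whereas _mm_load_ps/_mm_store_ps require 16-byte-aligned addresses:

    #include <xmmintrin.h>

    void copy4_floats(float *dst, const float *src)
    {
      _mm_storeu_ps(dst, _mm_loadu_ps(src));
    }
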
+/* Store the lower SPFP value across four words.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store1_ps (float *__P, __m128 __A)
+{
+  __v4sf __va = (__v4sf)__A;
+  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0));
+  _mm_storeu_ps (__P, __tmp);
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_store_ps1 (float *__P, __m128 __A)
+{
+  _mm_store1_ps (__P, __A);
+}
+
+/* Store four SPFP values in reverse order.  The address must be aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_storer_ps (float *__P, __m128 __A)
+{
+  __v4sf __va = (__v4sf)__A;
+  __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3));
+  _mm_store_ps (__P, __tmp);
+}
+
+/* Sets the low SPFP value of A from the low value of B.  */
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_move_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B,
+                                     __extension__
+                                     (__attribute__((__vector_size__ (16))) int)
+                                     {4,1,2,3});
+}
+
+/* Extracts one of the four words of A.  The selector N must be immediate.  */
+#ifdef __OPTIMIZE__
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_pi16 (__m64 const __A, int const __N)
+{
+  return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pextrw (__m64 const __A, int const __N)
+{
+  return _mm_extract_pi16 (__A, __N);
+}
+#else
+#define _mm_extract_pi16(A, N) \
+  ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N)))
+
+#define _m_pextrw(A, N) _mm_extract_pi16(A, N)
+#endif
+
+/* Inserts word D into one of four words of A.  The selector N must be
+   immediate.  */
+#ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
+{
+  return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pinsrw (__m64 const __A, int const __D, int const __N)
+{
+  return _mm_insert_pi16 (__A, __D, __N);
+}
+#else
+#define _mm_insert_pi16(A, D, N)                               \
+  ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A),    \
+                                       (int)(D), (int)(N)))
+
+#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N)
+#endif
+
+/* Compute the element-wise maximum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pi16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxsw (__m64 __A, __m64 __B)
+{
+  return _mm_max_pi16 (__A, __B);
+}
+
+/* Compute the element-wise maximum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmaxub (__m64 __A, __m64 __B)
+{
+  return _mm_max_pu8 (__A, __B);
+}
+
+/* Compute the element-wise minimum of signed 16-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pi16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminsw (__m64 __A, __m64 __B)
+{
+  return _mm_min_pi16 (__A, __B);
+}
+
+/* Compute the element-wise minimum of unsigned 8-bit values.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pminub (__m64 __A, __m64 __B)
+{
+  return _mm_min_pu8 (__A, __B);
+}
+
+/* Create an 8-bit mask of the signs of 8-bit values.  */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+  return __builtin_ia32_pmovmskb ((__v8qi)__A);
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmovmskb (__m64 __A)
+{
+  return _mm_movemask_pi8 (__A);
+}
+
+/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
+   in B and produce the high 16 bits of the 32-bit results.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pmulhuw (__m64 __A, __m64 __B)
+{
+  return _mm_mulhi_pu16 (__A, __B);
+}
+
+/* Return a combination of the four 16-bit values in A.  The selector
+   must be an immediate.  */
+#ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int const __N)
+{
+  return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pshufw (__m64 __A, int const __N)
+{
+  return _mm_shuffle_pi16 (__A, __N);
+}
+#else
+#define _mm_shuffle_pi16(A, N) \
+  ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N)))
+
+#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N)
+#endif
+
+/* Conditionally store byte elements of A into P.  The high bit of each
+   byte in the selector N determines whether the corresponding byte from
+   A is stored.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
+{
+#ifdef __MMX_WITH_SSE__
+  /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits
+     64:127 at address __P.  */
+  typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+  typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+  /* Zero-extend __A and __N to 128 bits.  */
+  __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 };
+  __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 };
+
+  /* Check the alignment of __P.  */
+  __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf;
+  if (offset)
+    {
+      /* If the misalignment of __P is greater than 8, move __P back by
+        8 bytes.  Otherwise, move __P back by the misalignment.  */
+      if (offset > 8)
+       offset = 8;
+      __P = (char *) (((__SIZE_TYPE__) __P) - offset);
+
+      /* Shift __A128 and __N128 to the left by the adjustment.  */
+      switch (offset)
+       {
+       case 1:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 8);
+         break;
+       case 2:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8);
+         break;
+       case 3:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8);
+         break;
+       case 4:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8);
+         break;
+       case 5:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8);
+         break;
+       case 6:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8);
+         break;
+       case 7:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8);
+         break;
+       case 8:
+         __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8);
+         __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8);
+         break;
+       default:
+         break;
+       }
+    }
+  __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P);
+#else
+  __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P);
+#endif
+}
+
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_maskmovq (__m64 __A, __m64 __N, char *__P)
+{
+  _mm_maskmove_si64 (__A, __N, __P);
+}
+
+/* Compute the rounded averages of the unsigned 8-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+   values in A and B.  Return the value in the lower 16-bit word; the
+   upper words are cleared.  */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+  return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
+}
+
+/* Likewise.  The address must be 16-byte aligned.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+  __builtin_ia32_movntps (__P, (__v4sf)__A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+   any subsequent store.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+  __builtin_ia32_sfence ();
+}
+
+/* Transpose the 4x4 matrix composed of row[0-3].  */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3)                      \
+do {                                                                   \
+  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3);   \
+  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1);                  \
+  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3);                  \
+  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1);                  \
+  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3);                  \
+  (row0) = __builtin_ia32_movlhps (__t0, __t1);                                \
+  (row1) = __builtin_ia32_movhlps (__t1, __t0);                                \
+  (row2) = __builtin_ia32_movlhps (__t2, __t3);                                \
+  (row3) = __builtin_ia32_movhlps (__t3, __t2);                                \
+} while (0)
+
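
A hypothetical use of the transpose macro on a 4x4 matrix stored row-major in memory:

    #include <xmmintrin.h>

    void transpose4x4(float m[16])
    {
      __m128 r0 = _mm_loadu_ps(m + 0);
      __m128 r1 = _mm_loadu_ps(m + 4);
      __m128 r2 = _mm_loadu_ps(m + 8);
      __m128 r3 = _mm_loadu_ps(m + 12);
      _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
      _mm_storeu_ps(m + 0,  r0);
      _mm_storeu_ps(m + 4,  r1);
      _mm_storeu_ps(m + 8,  r2);
      _mm_storeu_ps(m + 12, r3);
    }
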
+/* For backward source compatibility.  */
+# include <emmintrin.h>
+
+#ifdef __DISABLE_SSE__
+#undef __DISABLE_SSE__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE__ */
+
+/* The execution of the next instruction is delayed by an implementation
+   specific amount of time.  The instruction does not modify the
+   architectural state.  This is after the pop_options pragma because
+   it does not require SSE support in the processor--the encoding is a
+   nop on processors that do not support it.  */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+  __builtin_ia32_pause ();
+}
+
+#endif /* _XMMINTRIN_H_INCLUDED */
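
_mm_pause is commonly used in spin-wait loops; a minimal, hypothetical sketch:

    #include <xmmintrin.h>

    void spin_until_set(volatile int *flag)
    {
      while (!*flag)
        _mm_pause();            /* hints the CPU that this is a busy-wait */
    }
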
diff --git a/include-gcc/xopintrin.h b/include-gcc/xopintrin.h
new file mode 100644 (file)
index 0000000..39a03bf
--- /dev/null
+++ b/include-gcc/xopintrin.h
@@ -0,0 +1,850 @@
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _XOPMMINTRIN_H_INCLUDED
+#define _XOPMMINTRIN_H_INCLUDED
+
+#include <fma4intrin.h>
+
+#ifndef __XOP__
+#pragma GCC push_options
+#pragma GCC target("xop")
+#define __DISABLE_XOP__
+#endif /* __XOP__ */
+
+/* Integer multiply/add instructions. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpmadcswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C);
+}
+
+/* Packed Integer Horizontal Add and Subtract */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddw_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epi32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphadddq ((__v4si)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddw_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddd_epu16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_haddq_epu32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubw_epi8(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubd_epi16(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsubq_epi32(__m128i __A)
+{
+  return  (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A);
+}
+
+/* Vector conditional move and permute */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpcmov (__A, __B, __C);
+}
+
+extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C)
+{
+  return  (__m256i) __builtin_ia32_vpcmov256 (__A, __B, __C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C)
+{
+  return  (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C);
+}
+
+/* Packed Integer Rotates and Shifts
+   Rotates - Non-Immediate form */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_rot_epi64(__m128i __A,  __m128i __B)
+{
+  return (__m128i)  __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B);
+}
+
+/* Rotates - Immediate form */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi8(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi16(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi32(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_roti_epi64(__m128i __A, const int __B)
+{
+  return  (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B);
+}
+#else
+#define _mm_roti_epi8(A, N) \
+  ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi16(A, N) \
+  ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi32(A, N) \
+  ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N)))
+#define _mm_roti_epi64(A, N) \
+  ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N)))
+#endif
+
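
A hedged example of the immediate-form rotates (the helper name is invented; building it requires XOP support, e.g. compiling with -mxop):

    #include <x86intrin.h>

    __m128i rotl_each_lane_by_8(__m128i v)
    {
      return _mm_roti_epi32(v, 8);   /* rotate every 32-bit lane left by 8 bits */
    }
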
+/* Shifts */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shl_epi64(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B);
+}
+
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi8(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi16(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi32(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshad ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sha_epi64(__m128i __A,  __m128i __B)
+{
+  return  (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B);
+}
+
+/* Compare and Predicate Generation
+   pcom (integer, unsigned bytes) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B);
+}
+
+/*pcom (integer, unsigned words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B);
+}
+
+/*pcom (integer, unsigned double words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B);
+}
+
+/*pcom (integer, unsigned quad words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epu64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B);
+}
+
+/*pcom (integer, signed bytes) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi8(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B);
+}
+
+/*pcom (integer, signed words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi16(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B);
+}
+
+/*pcom (integer, signed double words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi32(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B);
+}
+
+/*pcom (integer, signed quad words) */
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comlt_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comle_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comgt_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comge_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comeq_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comneq_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comfalse_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_comtrue_epi64(__m128i __A, __m128i __B)
+{
+  return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B);
+}
+
+/* FRCZ */
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_ps (__m128 __A)
+{
+  return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_pd (__m128d __A)
+{
+  return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_ss (__m128 __A, __m128 __B)
+{
+  return (__m128) __builtin_ia32_movss ((__v4sf)__A,
+                                       (__v4sf)
+                                       __builtin_ia32_vfrczss ((__v4sf)__B));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_frcz_sd (__m128d __A, __m128d __B)
+{
+  return (__m128d) __builtin_ia32_movsd ((__v2df)__A,
+                                        (__v2df)
+                                        __builtin_ia32_vfrczsd ((__v2df)__B));
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_frcz_ps (__m256 __A)
+{
+  return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_frcz_pd (__m256d __A)
+{
+  return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A);
+}
+
+/* PERMIL2 */
+
+#ifdef __OPTIMIZE__
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I)
+{
+  return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X,
+                                             (__v2df)__Y,
+                                             (__v2di)__C,
+                                             __I);
+}
+
+extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I)
+{
+  return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X,
+                                                (__v4df)__Y,
+                                                (__v4di)__C,
+                                                __I);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I)
+{
+  return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X,
+                                            (__v4sf)__Y,
+                                            (__v4si)__C,
+                                            __I);
+}
+
+extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I)
+{
+  return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X,
+                                               (__v8sf)__Y,
+                                               (__v8si)__C,
+                                               __I);
+}
+#else
+#define _mm_permute2_pd(X, Y, C, I)                                    \
+  ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X),          \
+                                       (__v2df)(__m128d)(Y),           \
+                                       (__v2di)(__m128i)(C),           \
+                                       (int)(I)))
+
+#define _mm256_permute2_pd(X, Y, C, I)                                 \
+  ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X),       \
+                                          (__v4df)(__m256d)(Y),        \
+                                          (__v4di)(__m256i)(C),        \
+                                          (int)(I)))
+
+#define _mm_permute2_ps(X, Y, C, I)                                    \
+  ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X),            \
+                                      (__v4sf)(__m128)(Y),             \
+                                      (__v4si)(__m128i)(C),            \
+                                      (int)(I)))
+
+#define _mm256_permute2_ps(X, Y, C, I)                                 \
+  ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),         \
+                                         (__v8sf)(__m256)(Y),          \
+                                         (__v8si)(__m256i)(C),         \
+                                         (int)(I)))
+#endif /* __OPTIMIZE__ */
+
+#ifdef __DISABLE_XOP__
+#undef __DISABLE_XOP__
+#pragma GCC pop_options
+#endif /* __DISABLE_XOP__ */
+
+#endif /* _XOPMMINTRIN_H_INCLUDED */
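
A minimal usage sketch for the XOP rotate and unsigned-compare intrinsics defined in xopintrin.h above. This is illustrative only and not part of the imported header; it assumes the translation unit is compiled with -mxop and runs on an XOP-capable CPU.

  #include <x86intrin.h>
  #include <stdio.h>

  int main(void)
  {
    /* Rotate every byte of a vector left by 3 bits using the
       immediate-form intrinsic from xopintrin.h. */
    __m128i a = _mm_set1_epi8((char)0x81);
    __m128i r = _mm_roti_epi8(a, 3);

    /* Unsigned per-byte "greater than" compare; each result byte is
       all-ones where r > a (unsigned), all-zeros otherwise. */
    __m128i gt = _mm_comgt_epu8(r, a);

    unsigned char out[16], cmp[16];
    _mm_storeu_si128((__m128i *)out, r);
    _mm_storeu_si128((__m128i *)cmp, gt);
    printf("rotated byte: 0x%02x, compare mask byte: 0x%02x\n",
           out[0], cmp[0]);
    return 0;
  }
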
diff --git a/include-gcc/xsavecintrin.h b/include-gcc/xsavecintrin.h
new file mode 100644 (file)
index 0000000..185863a
--- /dev/null
@@ -0,0 +1,58 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsavecintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVECINTRIN_H_INCLUDED
+#define _XSAVECINTRIN_H_INCLUDED
+
+#ifndef __XSAVEC__
+#pragma GCC push_options
+#pragma GCC target("xsavec")
+#define __DISABLE_XSAVEC__
+#endif /* __XSAVEC__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsavec (void *__P, long long __M)
+{
+  __builtin_ia32_xsavec (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsavec64 (void *__P, long long __M)
+{
+  __builtin_ia32_xsavec64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVEC__
+#undef __DISABLE_XSAVEC__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVEC__ */
+
+#endif /* _XSAVECINTRIN_H_INCLUDED */
diff --git a/include-gcc/xsaveintrin.h b/include-gcc/xsaveintrin.h
new file mode 100644 (file)
index 0000000..092b1fe
--- /dev/null
@@ -0,0 +1,86 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsaveintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVEINTRIN_H_INCLUDED
+#define _XSAVEINTRIN_H_INCLUDED
+
+#ifndef __XSAVE__
+#pragma GCC push_options
+#pragma GCC target("xsave")
+#define __DISABLE_XSAVE__
+#endif /* __XSAVE__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave (void *__P, long long __M)
+{
+  __builtin_ia32_xsave (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor (void *__P, long long __M)
+{
+  __builtin_ia32_xrstor (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsetbv (unsigned int __A, long long __V)
+{
+  __builtin_ia32_xsetbv (__A, __V);
+}
+
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xgetbv (unsigned int __A)
+{
+  return __builtin_ia32_xgetbv (__A);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave64 (void *__P, long long __M)
+{
+  __builtin_ia32_xsave64 (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor64 (void *__P, long long __M)
+{
+  __builtin_ia32_xrstor64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVE__
+#undef __DISABLE_XSAVE__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVE__ */
+
+#endif /* _XSAVEINTRIN_H_INCLUDED */
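
A short sketch of a common use of the XSAVE intrinsics above: reading XCR0 with _xgetbv to check which state components the OS has enabled (normally combined with CPUID OSXSAVE/feature checks first). Illustrative only; assumes compilation with -mxsave. The bit masks below are the architectural XCR0 bits for SSE and AVX state, named here for readability rather than taken from this header.

  #include <x86intrin.h>
  #include <stdio.h>

  /* Architectural XCR0 component bits (names chosen for this example). */
  #define XCR0_SSE_STATE (1ULL << 1)
  #define XCR0_AVX_STATE (1ULL << 2)

  int main(void)
  {
    /* _xgetbv(0) reads extended control register XCR0. */
    unsigned long long xcr0 = _xgetbv(0);

    if ((xcr0 & (XCR0_SSE_STATE | XCR0_AVX_STATE))
        == (XCR0_SSE_STATE | XCR0_AVX_STATE))
      printf("OS has enabled XMM and YMM state saving\n");
    else
      printf("AVX state not enabled by the OS\n");
    return 0;
  }
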
diff --git a/include-gcc/xsaveoptintrin.h b/include-gcc/xsaveoptintrin.h
new file mode 100644 (file)
index 0000000..337b006
--- /dev/null
@@ -0,0 +1,58 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsaveoptintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVEOPTINTRIN_H_INCLUDED
+#define _XSAVEOPTINTRIN_H_INCLUDED
+
+#ifndef __XSAVEOPT__
+#pragma GCC push_options
+#pragma GCC target("xsaveopt")
+#define __DISABLE_XSAVEOPT__
+#endif /* __XSAVEOPT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt (void *__P, long long __M)
+{
+  __builtin_ia32_xsaveopt (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt64 (void *__P, long long __M)
+{
+  __builtin_ia32_xsaveopt64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVEOPT__
+#undef __DISABLE_XSAVEOPT__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVEOPT__ */
+
+#endif /* _XSAVEOPTINTRIN_H_INCLUDED */
diff --git a/include-gcc/xsavesintrin.h b/include-gcc/xsavesintrin.h
new file mode 100644 (file)
index 0000000..6a230d0
--- /dev/null
@@ -0,0 +1,72 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsavesintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVESINTRIN_H_INCLUDED
+#define _XSAVESINTRIN_H_INCLUDED
+
+#ifndef __XSAVES__
+#pragma GCC push_options
+#pragma GCC target("xsaves")
+#define __DISABLE_XSAVES__
+#endif /* __XSAVES__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaves (void *__P, long long __M)
+{
+  __builtin_ia32_xsaves (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstors (void *__P, long long __M)
+{
+  __builtin_ia32_xrstors (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstors64 (void *__P, long long __M)
+{
+  __builtin_ia32_xrstors64 (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaves64 (void *__P, long long __M)
+{
+  __builtin_ia32_xsaves64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVES__
+#undef __DISABLE_XSAVES__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVES__ */
+
+#endif /* _XSAVESINTRIN_H_INCLUDED */
diff --git a/include-gcc/xtestintrin.h b/include-gcc/xtestintrin.h
new file mode 100644 (file)
index 0000000..7216e80
--- /dev/null
@@ -0,0 +1,51 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xtestintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XTESTINTRIN_H_INCLUDED
+#define _XTESTINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+/* Return non-zero if the instruction executes inside an RTM or HLE code
+   region.  Return zero otherwise.   */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xtest (void)
+{
+  return __builtin_ia32_xtest ();
+}
+
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _XTESTINTRIN_H_INCLUDED */
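
A minimal sketch of _xtest in context: it only returns non-zero when executed inside a transaction, so it is normally paired with _xbegin/_xend from the companion rtmintrin.h header of the same GCC intrinsics set. Illustrative only; assumes a CPU with RTM support and compilation with -mrtm.

  #include <immintrin.h>
  #include <stdio.h>

  int main(void)
  {
    int was_transactional = 0;

    /* _xbegin returns _XBEGIN_STARTED when the transaction begins;
       on abort, execution resumes here with a status code instead. */
    if (_xbegin() == _XBEGIN_STARTED)
      {
        was_transactional = _xtest();  /* non-zero inside the RTM region */
        _xend();
      }

    printf("executed transactionally: %s\n",
           was_transactional ? "yes" : "no");
    return 0;
  }
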