From: Andrei Tatar
Date: Mon, 17 Jul 2023 17:07:59 +0000 (+0200)
Subject: Implement support for GCC
X-Git-Tag: RELEASE-0.14.0~7
X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=5665f9972fbeb9a28cbf94132b89a253af91ac01;p=unikraft%2Flibs%2Fintel-intrinsics.git

Implement support for GCC

This change adds support for GCC, on par with the existing Clang support,
by providing the native x86 intrinsics headers from GCC release 13.1.0.
The file `mm_malloc.h` is taken from upstream `pmm_malloc.h`.

Signed-off-by: Andrei Tatar
Reviewed-by: Maria Sfiraiala
Reviewed-by: Radu Nichita
Approved-by: Razvan Deaconescu
Tested-by: Unikraft CI
GitHub-Closes: #3
---

diff --git a/Makefile.uk b/Makefile.uk
index b8cc585..fb83280 100644
--- a/Makefile.uk
+++ b/Makefile.uk
@@ -41,6 +41,9 @@ $(eval $(call addlib_s,libintel_intrinsics,$(CONFIG_LIBINTEL_INTRINSICS)))
 # Library includes
 ################################################################################
 ifeq ($(CONFIG_LIBINTEL_INTRINSICS),y)
+CINCLUDES-$(call have_gcc) += -I$(LIBINTEL_INTRINSICS_BASE)/include-gcc
+CXXINCLUDES-$(call have_gcc) += -I$(LIBINTEL_INTRINSICS_BASE)/include-gcc
+
 CINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
 CXXINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
 endif
diff --git a/include-gcc/adxintrin.h b/include-gcc/adxintrin.h
new file mode 100644
index 0000000..e7b9999
--- /dev/null
+++ b/include-gcc/adxintrin.h
@@ -0,0 +1,81 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+   . */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use directly; include instead."
+#endif + +#ifndef _ADXINTRIN_H_INCLUDED +#define _ADXINTRIN_H_INCLUDED + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_subborrow_u32 (unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) +{ + return __builtin_ia32_sbb_u32 (__CF, __X, __Y, __P); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarry_u32 (unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) +{ + return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarryx_u32 (unsigned char __CF, unsigned int __X, + unsigned int __Y, unsigned int *__P) +{ + return __builtin_ia32_addcarryx_u32 (__CF, __X, __Y, __P); +} + +#ifdef __x86_64__ +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_subborrow_u64 (unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) +{ + return __builtin_ia32_sbb_u64 (__CF, __X, __Y, __P); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarry_u64 (unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) +{ + return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_addcarryx_u64 (unsigned char __CF, unsigned long long __X, + unsigned long long __Y, unsigned long long *__P) +{ + return __builtin_ia32_addcarryx_u64 (__CF, __X, __Y, __P); +} +#endif + +#endif /* _ADXINTRIN_H_INCLUDED */ diff --git a/include-gcc/ammintrin.h b/include-gcc/ammintrin.h new file mode 100644 index 0000000..24cda1f --- /dev/null +++ b/include-gcc/ammintrin.h @@ -0,0 +1,93 @@ +/* Copyright (C) 2007-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
*/ + +/* Implemented from the specification included in the AMD Programmers + Manual Update, version 2.x */ + +#ifndef _AMMINTRIN_H_INCLUDED +#define _AMMINTRIN_H_INCLUDED + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +#ifndef __SSE4A__ +#pragma GCC push_options +#pragma GCC target("sse4a") +#define __DISABLE_SSE4A__ +#endif /* __SSE4A__ */ + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_sd (double * __P, __m128d __Y) +{ + __builtin_ia32_movntsd (__P, (__v2df) __Y); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_ss (float * __P, __m128 __Y) +{ + __builtin_ia32_movntss (__P, (__v4sf) __Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_si64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_extrq ((__v2di) __X, (__v16qi) __Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extracti_si64 (__m128i __X, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_extrqi ((__v2di) __X, __I, __L); +} +#else +#define _mm_extracti_si64(X, I, L) \ + ((__m128i) __builtin_ia32_extrqi ((__v2di)(__m128i)(X), \ + (unsigned int)(I), (unsigned int)(L))) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_si64 (__m128i __X,__m128i __Y) +{ + return (__m128i) __builtin_ia32_insertq ((__v2di)__X, (__v2di)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_inserti_si64(__m128i __X, __m128i __Y, unsigned const int __I, unsigned const int __L) +{ + return (__m128i) __builtin_ia32_insertqi ((__v2di)__X, (__v2di)__Y, __I, __L); +} +#else +#define _mm_inserti_si64(X, Y, I, L) \ + ((__m128i) __builtin_ia32_insertqi ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), \ + (unsigned int)(I), (unsigned int)(L))) +#endif + +#ifdef __DISABLE_SSE4A__ +#undef __DISABLE_SSE4A__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE4A__ */ + +#endif /* _AMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/amxbf16intrin.h b/include-gcc/amxbf16intrin.h new file mode 100644 index 0000000..33ee234 --- /dev/null +++ b/include-gcc/amxbf16intrin.h @@ -0,0 +1,52 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
+#endif + +#ifndef _AMXBF16INTRIN_H_INCLUDED +#define _AMXBF16INTRIN_H_INCLUDED + +#if !defined(__AMX_BF16__) +#pragma GCC push_options +#pragma GCC target("amx-bf16") +#define __DISABLE_AMX_BF16__ +#endif /* __AMX_BF16__ */ + +#if defined(__x86_64__) +#define _tile_dpbf16ps_internal(dst,src1,src2) \ + __asm__ volatile\ + ("{tdpbf16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpbf16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbf16ps(dst,src1,src2) \ + _tile_dpbf16ps_internal (dst, src1, src2) + +#endif + +#ifdef __DISABLE_AMX_BF16__ +#undef __DISABLE_AMX_BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_BF16__ */ + +#endif /* _AMXBF16INTRIN_H_INCLUDED */ diff --git a/include-gcc/amxcomplexintrin.h b/include-gcc/amxcomplexintrin.h new file mode 100644 index 0000000..6ea1eca --- /dev/null +++ b/include-gcc/amxcomplexintrin.h @@ -0,0 +1,59 @@ +/* Copyright (C) 2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AMXCOMPLEXINTRIN_H_INCLUDED +#define _AMXCOMPLEXINTRIN_H_INCLUDED + +#if !defined(__AMX_COMPLEX__) +#pragma GCC push_options +#pragma GCC target("amx-complex") +#define __DISABLE_AMX_COMPLEX__ +#endif /* __AMX_COMPLEX__ */ + +#if defined(__x86_64__) +#define _tile_cmmimfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{tcmmimfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmimfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_cmmrlfp16ps_internal(src1_dst,src2,src3) \ + __asm__ volatile\ + ("{tcmmrlfp16ps\t%%tmm"#src3", %%tmm"#src2", %%tmm"#src1_dst"|tcmmrlfp16ps\t%%tmm"#src1_dst", %%tmm"#src2", %%tmm"#src3"}" ::) + +#define _tile_cmmimfp16ps(src1_dst,src2,src3) \ + _tile_cmmimfp16ps_internal (src1_dst, src2, src3) + +#define _tile_cmmrlfp16ps(src1_dst,src2,src3) \ + _tile_cmmrlfp16ps_internal (src1_dst, src2, src3) + +#endif + +#ifdef __DISABLE_AMX_COMPLEX__ +#undef __DISABLE_AMX_COMPLEX__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_COMPLEX__ */ + +#endif /* _AMXCOMPLEXINTRIN_H_INCLUDED */ diff --git a/include-gcc/amxfp16intrin.h b/include-gcc/amxfp16intrin.h new file mode 100644 index 0000000..340945b --- /dev/null +++ b/include-gcc/amxfp16intrin.h @@ -0,0 +1,46 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AMXFP16INTRIN_H_INCLUDED +#define _AMXFP16INTRIN_H_INCLUDED + +#if defined(__x86_64__) +#define _tile_dpfp16ps_internal(dst,src1,src2) \ + __asm__ volatile \ + ("{tdpfp16ps\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|tdpfp16ps\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpfp16ps(dst,src1,src2) \ + _tile_dpfp16ps_internal (dst,src1,src2) + +#endif + +#ifdef __DISABLE_AMX_FP16__ +#undef __DISABLE_AMX_FP16__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_FP16__ */ + +#endif /* _AMXFP16INTRIN_H_INCLUDED */ diff --git a/include-gcc/amxint8intrin.h b/include-gcc/amxint8intrin.h new file mode 100644 index 0000000..6b69cfb --- /dev/null +++ b/include-gcc/amxint8intrin.h @@ -0,0 +1,61 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
+#endif + +#ifndef _AMXINT8INTRIN_H_INCLUDED +#define _AMXINT8INTRIN_H_INCLUDED + +#if !defined(__AMX_INT8__) +#pragma GCC push_options +#pragma GCC target("amx-int8") +#define __DISABLE_AMX_INT8__ +#endif /* __AMX_INT8__ */ + +#if defined(__x86_64__) +#define _tile_int8_dp_internal(name,dst,src1,src2) \ + __asm__ volatile \ + ("{"#name"\t%%tmm"#src2", %%tmm"#src1", %%tmm"#dst"|"#name"\t%%tmm"#dst", %%tmm"#src1", %%tmm"#src2"}" ::) + +#define _tile_dpbssd(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbssd, dst, src1, src2) + +#define _tile_dpbsud(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbsud, dst, src1, src2) + +#define _tile_dpbusd(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbusd, dst, src1, src2) + +#define _tile_dpbuud(dst,src1,src2) \ + _tile_int8_dp_internal (tdpbuud, dst, src1, src2) + +#endif + +#ifdef __DISABLE_AMX_INT8__ +#undef __DISABLE_AMX_INT8__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_INT8__ */ + +#endif /* _AMXINT8INTRIN_H_INCLUDED */ diff --git a/include-gcc/amxtileintrin.h b/include-gcc/amxtileintrin.h new file mode 100644 index 0000000..cc60226 --- /dev/null +++ b/include-gcc/amxtileintrin.h @@ -0,0 +1,98 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
+#endif + +#ifndef _AMXTILEINTRIN_H_INCLUDED +#define _AMXTILEINTRIN_H_INCLUDED + +#if !defined(__AMX_TILE__) +#pragma GCC push_options +#pragma GCC target("amx-tile") +#define __DISABLE_AMX_TILE__ +#endif /* __AMX_TILE__ */ + +#if defined(__x86_64__) +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_loadconfig (const void *__config) +{ + __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_storeconfig (void *__config) +{ + __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tile_release (void) +{ + __asm__ volatile ("tilerelease" ::); +} + +#define _tile_loadd(dst,base,stride) \ + _tile_loadd_internal (dst, base, stride) + +#define _tile_loadd_internal(dst,base,stride) \ + __asm__ volatile \ + ("{tileloadd\t(%0,%1,1), %%tmm"#dst"|tileloadd\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride))) + +#define _tile_stream_loadd(dst,base,stride) \ + _tile_stream_loadd_internal (dst, base, stride) + +#define _tile_stream_loadd_internal(dst,base,stride) \ + __asm__ volatile \ + ("{tileloaddt1\t(%0,%1,1), %%tmm"#dst"|tileloaddt1\t%%tmm"#dst", [%0+%1*1]}" \ + :: "r" ((const void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride))) + +#define _tile_stored(dst,base,stride) \ + _tile_stored_internal (dst, base, stride) + +#define _tile_stored_internal(src,base,stride) \ + __asm__ volatile \ + ("{tilestored\t%%tmm"#src", (%0,%1,1)|tilestored\t[%0+%1*1], %%tmm"#src"}" \ + :: "r" ((void*) (base)), "r" ((__PTRDIFF_TYPE__) (stride)) \ + : "memory") + +#define _tile_zero(dst) \ + _tile_zero_internal (dst) + +#define _tile_zero_internal(dst) \ + __asm__ volatile \ + ("tilezero\t%%tmm"#dst ::) + +#endif + +#ifdef __DISABLE_AMX_TILE__ +#undef __DISABLE_AMX_TILE__ +#pragma GCC pop_options +#endif /* __DISABLE_AMX_TILE__ */ + +#endif /* _AMXTILEINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx2intrin.h b/include-gcc/avx2intrin.h new file mode 100644 index 0000000..1b9c816 --- /dev/null +++ b/include-gcc/avx2intrin.h @@ -0,0 +1,1923 @@ +/* Copyright (C) 2011-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +#ifndef _AVX2INTRIN_H_INCLUDED +#define _AVX2INTRIN_H_INCLUDED + +#ifndef __AVX2__ +#pragma GCC push_options +#pragma GCC target("avx2") +#define __DISABLE_AVX2__ +#endif /* __AVX2__ */ + +/* Sum absolute 8-bit integer difference of adjacent groups of 4 + byte integers in the first 2 operands. Starting offsets within + operands are determined by the 3rd mask operand. */ +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mpsadbw_epu8 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)__X, + (__v32qi)__Y, __M); +} +#else +#define _mm256_mpsadbw_epu8(X, Y, M) \ + ((__m256i) __builtin_ia32_mpsadbw256 ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi8 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsb256 ((__v32qi)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi16 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsw256 ((__v16hi)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi32 (__m256i __A) +{ + return (__m256i)__builtin_ia32_pabsd256 ((__v8si)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packs_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packssdw256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packs_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packsswb256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packus_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packusdw256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_packus_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_packuswb256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v32qu)__A + (__v32qu)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hu)__A + (__v16hu)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A + (__v8su)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A + (__v4du)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddusb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_adds_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_paddusw256 ((__v16hi)__A, (__v16hi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi8 (__m256i __A, __m256i __B, const int __N) +{ + return (__m256i) __builtin_ia32_palignr256 ((__v4di)__A, + (__v4di)__B, + __N * 8); +} +#else +/* In that case (__N*8) will be in vreg, and insn will not be matched. */ +/* Use define instead */ +#define _mm256_alignr_epi8(A, B, N) \ + ((__m256i) __builtin_ia32_palignr256 ((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (int)(N) * 8)) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A & (__v4du)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_andnotsi256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_avg_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pavgb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_avg_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pavgw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_epi8 (__m256i __X, __m256i __Y, __m256i __M) +{ + return (__m256i) __builtin_ia32_pblendvb256 ((__v32qi)__X, + (__v32qi)__Y, + (__v32qi)__M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_epi16 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_pblendw256 ((__v16hi)__X, + (__v16hi)__Y, + __M); +} +#else +#define _mm256_blend_epi16(X, Y, M) \ + ((__m256i) __builtin_ia32_pblendw256 ((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v32qi)__A == (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hi)__A == (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8si)__A == (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4di)__A == (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v32qs)__A > (__v32qs)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_cmpgt_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hi)__A > (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8si)__A > (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4di)__A > (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddd256 ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadds_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phaddsw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubd256 ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsubs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_phsubsw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddubs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmaddubsw256 ((__v32qi)__X, + (__v32qi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_madd_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaddwd256 ((__v16hi)__A, + (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxsd256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxub256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmaxuw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu32 (__m256i __A, __m256i __B) +{ + 
return (__m256i)__builtin_ia32_pmaxud256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminsd256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminub256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pminud256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_epi8 (__m256i __A) +{ + return __builtin_ia32_pmovmskb256 ((__v32qi)__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi16 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbw256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbd256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi8_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxbq256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxwd256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxwq256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxdq256 ((__v4si)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi16 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbw256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbd256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu8_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxbq256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxwd256 
((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxwq256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxdq256 ((__v4si)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256 ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhrs_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmulhrsw256 ((__v16hi)__X, + (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhi_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmulhuw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mulhi_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmulhw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hu)__A * (__v16hu)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A * (__v8su)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_epu32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_pmuludq256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A | (__v4du)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sad_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psadbw256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pshufb256 ((__v32qi)__X, + (__v32qi)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_epi32 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufd256 ((__v8si)__A, __mask); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflehi_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshufhw256 ((__v16hi)__A, __mask); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shufflelo_epi16 (__m256i __A, const int __mask) +{ + return (__m256i)__builtin_ia32_pshuflw256 ((__v16hi)__A, __mask); +} +#else +#define _mm256_shuffle_epi32(A, N) \ + ((__m256i)__builtin_ia32_pshufd256 ((__v8si)(__m256i)(A), (int)(N))) +#define _mm256_shufflehi_epi16(A, N) \ + ((__m256i)__builtin_ia32_pshufhw256 ((__v16hi)(__m256i)(A), (int)(N))) +#define _mm256_shufflelo_epi16(A, N) \ + 
((__m256i)__builtin_ia32_pshuflw256 ((__v16hi)(__m256i)(A), (int)(N))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignb256 ((__v32qi)__X, (__v32qi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi16 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignw256 ((__v16hi)__X, (__v16hi)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sign_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psignd256 ((__v8si)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bslli_epi128 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_pslldqi256 (__A, __N * 8); +} +#else +#define _mm256_bslli_epi128(A, N) \ + ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) +#define _mm256_slli_si256(A, N) \ + ((__m256i)__builtin_ia32_pslldqi256 ((__m256i)(A), (int)(N) * 8)) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psllwi256 ((__v16hi)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi16 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psllw256((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_pslldi256 ((__v8si)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi32 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_pslld256((__v8si)__A, (__v4si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_slli_epi64 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psllqi256 ((__v4di)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sll_epi64 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psllq256((__v4di)__A, (__v2di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrawi256 ((__v16hi)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi16 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psraw256 ((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psradi256 ((__v8si)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi32 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrad256 ((__v8si)__A, (__v4si)__B); +} + +#ifdef __OPTIMIZE__ 
+extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bsrli_epi128 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_si256 (__m256i __A, const int __N) +{ + return (__m256i)__builtin_ia32_psrldqi256 (__A, __N * 8); +} +#else +#define _mm256_bsrli_epi128(A, N) \ + ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) +#define _mm256_srli_si256(A, N) \ + ((__m256i)__builtin_ia32_psrldqi256 ((__m256i)(A), (int)(N) * 8)) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi16 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrlwi256 ((__v16hi)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srl_epi16 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrlw256((__v16hi)__A, (__v8hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi32 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrldi256 ((__v8si)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srl_epi32 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrld256((__v8si)__A, (__v4si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srli_epi64 (__m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_psrlqi256 ((__v4di)__A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srl_epi64 (__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psrlq256((__v4di)__A, (__v2di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v32qu)__A - (__v32qu)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v16hu)__A - (__v16hu)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A - (__v8su)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A - (__v4du)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubsb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubsw256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epu8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubusb256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_subs_epu16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psubusw256 
((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhbw256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhwd256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhdq256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckhqdq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpcklbw256 ((__v32qi)__A, (__v32qi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpcklwd256 ((__v16hi)__A, (__v16hi)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpckldq256 ((__v8si)__A, (__v8si)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_punpcklqdq256 ((__v4di)__A, (__v4di)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_si256 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A ^ (__v4du)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_load_si256 (__m256i const *__X) +{ + return (__m256i) __builtin_ia32_movntdqa256 ((__v4di *) __X); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastss_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_vbroadcastss_ps ((__v4sf)__X); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastss_ps (__m128 __X) +{ + return (__m256) __builtin_ia32_vbroadcastss_ps256 ((__v4sf)__X); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastsd_pd (__m128d __X) +{ + return (__m256d) __builtin_ia32_vbroadcastsd_pd256 ((__v2df)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastsi128_si256 (__m128i __X) +{ + return (__m256i) __builtin_ia32_vbroadcastsi256 ((__v2di)__X); +} + +#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) +#define _mm_broadcastsd_pd(X) _mm_movedup_pd(X) + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi32 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pblendd128 ((__v4si)__X, + (__v4si)__Y, + __M); +} +#else +#define _mm_blend_epi32(X, Y, M) \ + ((__m128i) __builtin_ia32_pblendd128 
((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_epi32 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_pblendd256 ((__v8si)__X, + (__v8si)__Y, + __M); +} +#else +#define _mm256_blend_epi32(X, Y, M) \ + ((__m256i) __builtin_ia32_pblendd256 ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastb_epi8 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastb256 ((__v16qi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastw_epi16 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastw256 ((__v8hi)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastd_epi32 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastd256 ((__v4si)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastq_epi64 (__m128i __X) +{ + return (__m256i) __builtin_ia32_pbroadcastq256 ((__v2di)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastb_epi8 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastb128 ((__v16qi)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastw_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastw128 ((__v8hi)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastd_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastd128 ((__v4si)__X); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastq_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pbroadcastq128 ((__v2di)__X); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar8x32_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256 ((__v8si)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute4x64_pd (__m256d __X, const int __M) +{ + return (__m256d) __builtin_ia32_permdf256 ((__v4df)__X, __M); +} +#else +#define _mm256_permute4x64_pd(X, M) \ + ((__m256d) __builtin_ia32_permdf256 ((__v4df)(__m256d)(X), (int)(M))) +#endif + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar8x32_ps (__m256 __X, __m256i __Y) +{ + return (__m256) __builtin_ia32_permvarsf256 ((__v8sf)__X, (__v8si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute4x64_epi64 (__m256i __X, const int __M) +{ + return (__m256i) __builtin_ia32_permdi256 ((__v4di)__X, __M); +} +#else +#define _mm256_permute4x64_epi64(X, M) \ + ((__m256i) __builtin_ia32_permdi256 ((__v4di)(__m256i)(X), (int)(M))) +#endif + + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2x128_si256 (__m256i __X, __m256i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_permti256 
((__v4di)__X, (__v4di)__Y, __M); +} +#else +#define _mm256_permute2x128_si256(X, Y, M) \ + ((__m256i) __builtin_ia32_permti256 ((__v4di)(__m256i)(X), (__v4di)(__m256i)(Y), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti128_si256 (__m256i __X, const int __M) +{ + return (__m128i) __builtin_ia32_extract128i256 ((__v4di)__X, __M); +} +#else +#define _mm256_extracti128_si256(X, M) \ + ((__m128i) __builtin_ia32_extract128i256 ((__v4di)(__m256i)(X), (int)(M))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti128_si256 (__m256i __X, __m128i __Y, const int __M) +{ + return (__m256i) __builtin_ia32_insert128i256 ((__v4di)__X, (__v2di)__Y, __M); +} +#else +#define _mm256_inserti128_si256(X, Y, M) \ + ((__m256i) __builtin_ia32_insert128i256 ((__v4di)(__m256i)(X), \ + (__v2di)(__m128i)(Y), \ + (int)(M))) +#endif + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_epi32 (int const *__X, __m256i __M ) +{ + return (__m256i) __builtin_ia32_maskloadd256 ((const __v8si *)__X, + (__v8si)__M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_epi64 (long long const *__X, __m256i __M ) +{ + return (__m256i) __builtin_ia32_maskloadq256 ((const __v4di *)__X, + (__v4di)__M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_epi32 (int const *__X, __m128i __M ) +{ + return (__m128i) __builtin_ia32_maskloadd ((const __v4si *)__X, + (__v4si)__M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_epi64 (long long const *__X, __m128i __M ) +{ + return (__m128i) __builtin_ia32_maskloadq ((const __v2di *)__X, + (__v2di)__M); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_epi32 (int *__X, __m256i __M, __m256i __Y ) +{ + __builtin_ia32_maskstored256 ((__v8si *)__X, (__v8si)__M, (__v8si)__Y); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_epi64 (long long *__X, __m256i __M, __m256i __Y ) +{ + __builtin_ia32_maskstoreq256 ((__v4di *)__X, (__v4di)__M, (__v4di)__Y); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_epi32 (int *__X, __m128i __M, __m128i __Y ) +{ + __builtin_ia32_maskstored ((__v4si *)__X, (__v4si)__M, (__v4si)__Y); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_epi64 (long long *__X, __m128i __M, __m128i __Y ) +{ + __builtin_ia32_maskstoreq (( __v2di *)__X, (__v2di)__M, (__v2di)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sllv_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sllv_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sllv_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv4di ((__v4di)__X, 
(__v4di)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sllv_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di ((__v2di)__X, (__v2di)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srav_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srav_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srlv_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si ((__v8si)__X, (__v8si)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srlv_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srlv_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di ((__v4di)__X, (__v4di)__Y); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srlv_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di ((__v2di)__X, (__v2di)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_pd (double const *__base, __m128i __index, const int __scale) +{ + __v2df __zero = _mm_setzero_pd (); + __v2df __mask = _mm_cmpeq_pd (__zero, __zero); + + return (__m128d) __builtin_ia32_gathersiv2df (_mm_undefined_pd (), + __base, + (__v4si)__index, + __mask, + __scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_pd (__m128d __src, double const *__base, __m128i __index, + __m128d __mask, const int __scale) +{ + return (__m128d) __builtin_ia32_gathersiv2df ((__v2df)__src, + __base, + (__v4si)__index, + (__v2df)__mask, + __scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_pd (double const *__base, __m128i __index, const int __scale) +{ + __v4df __zero = _mm256_setzero_pd (); + __v4df __mask = _mm256_cmp_pd (__zero, __zero, _CMP_EQ_OQ); + + return (__m256d) __builtin_ia32_gathersiv4df (_mm256_undefined_pd (), + __base, + (__v4si)__index, + __mask, + __scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_pd (__m256d __src, double const *__base, + __m128i __index, __m256d __mask, const int __scale) +{ + return (__m256d) __builtin_ia32_gathersiv4df ((__v4df)__src, + __base, + (__v4si)__index, + (__v4df)__mask, + __scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_pd (double const *__base, __m128i __index, const int __scale) +{ + __v2df __src = _mm_setzero_pd (); + __v2df __mask = _mm_cmpeq_pd (__src, __src); + + return (__m128d) __builtin_ia32_gatherdiv2df (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_pd (__m128d __src, double const 
*__base, __m128i __index, + __m128d __mask, const int __scale) +{ + return (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)__src, + __base, + (__v2di)__index, + (__v2df)__mask, + __scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_pd (double const *__base, __m256i __index, const int __scale) +{ + __v4df __src = _mm256_setzero_pd (); + __v4df __mask = _mm256_cmp_pd (__src, __src, _CMP_EQ_OQ); + + return (__m256d) __builtin_ia32_gatherdiv4df (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_pd (__m256d __src, double const *__base, + __m256i __index, __m256d __mask, const int __scale) +{ + return (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)__src, + __base, + (__v4di)__index, + (__v4df)__mask, + __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_ps (float const *__base, __m128i __index, const int __scale) +{ + __v4sf __src = _mm_setzero_ps (); + __v4sf __mask = _mm_cmpeq_ps (__src, __src); + + return (__m128) __builtin_ia32_gathersiv4sf (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_ps (__m128 __src, float const *__base, __m128i __index, + __m128 __mask, const int __scale) +{ + return (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)__src, + __base, + (__v4si)__index, + (__v4sf)__mask, + __scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_ps (float const *__base, __m256i __index, const int __scale) +{ + __v8sf __src = _mm256_setzero_ps (); + __v8sf __mask = _mm256_cmp_ps (__src, __src, _CMP_EQ_OQ); + + return (__m256) __builtin_ia32_gathersiv8sf (__src, + __base, + (__v8si)__index, + __mask, + __scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_ps (__m256 __src, float const *__base, + __m256i __index, __m256 __mask, const int __scale) +{ + return (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)__src, + __base, + (__v8si)__index, + (__v8sf)__mask, + __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_ps (float const *__base, __m128i __index, const int __scale) +{ + __v4sf __src = _mm_setzero_ps (); + __v4sf __mask = _mm_cmpeq_ps (__src, __src); + + return (__m128) __builtin_ia32_gatherdiv4sf (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_ps (__m128 __src, float const *__base, __m128i __index, + __m128 __mask, const int __scale) +{ + return (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)__src, + __base, + (__v2di)__index, + (__v4sf)__mask, + __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_ps (float const *__base, __m256i __index, const int __scale) +{ + __v4sf __src = _mm_setzero_ps (); + __v4sf __mask = _mm_cmpeq_ps (__src, __src); + + return (__m128) __builtin_ia32_gatherdiv4sf256 (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_ps (__m128 
__src, float const *__base, + __m256i __index, __m128 __mask, const int __scale) +{ + return (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)__src, + __base, + (__v4di)__index, + (__v4sf)__mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_epi64 (long long int const *__base, + __m128i __index, const int __scale) +{ + __v2di __src = __extension__ (__v2di){ 0, 0 }; + __v2di __mask = __extension__ (__v2di){ ~0, ~0 }; + + return (__m128i) __builtin_ia32_gathersiv2di (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_epi64 (__m128i __src, long long int const *__base, + __m128i __index, __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gathersiv2di ((__v2di)__src, + __base, + (__v4si)__index, + (__v2di)__mask, + __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_epi64 (long long int const *__base, + __m128i __index, const int __scale) +{ + __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 }; + __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; + + return (__m256i) __builtin_ia32_gathersiv4di (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_epi64 (__m256i __src, long long int const *__base, + __m128i __index, __m256i __mask, + const int __scale) +{ + return (__m256i) __builtin_ia32_gathersiv4di ((__v4di)__src, + __base, + (__v4si)__index, + (__v4di)__mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_epi64 (long long int const *__base, + __m128i __index, const int __scale) +{ + __v2di __src = __extension__ (__v2di){ 0, 0 }; + __v2di __mask = __extension__ (__v2di){ ~0, ~0 }; + + return (__m128i) __builtin_ia32_gatherdiv2di (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_epi64 (__m128i __src, long long int const *__base, + __m128i __index, __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)__src, + __base, + (__v2di)__index, + (__v2di)__mask, + __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_epi64 (long long int const *__base, + __m256i __index, const int __scale) +{ + __v4di __src = __extension__ (__v4di){ 0, 0, 0, 0 }; + __v4di __mask = __extension__ (__v4di){ ~0, ~0, ~0, ~0 }; + + return (__m256i) __builtin_ia32_gatherdiv4di (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_epi64 (__m256i __src, long long int const *__base, + __m256i __index, __m256i __mask, + const int __scale) +{ + return (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)__src, + __base, + (__v4di)__index, + (__v4di)__mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32gather_epi32 (int const *__base, __m128i __index, const int __scale) +{ + __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + + return 
(__m128i) __builtin_ia32_gathersiv4si (__src, + __base, + (__v4si)__index, + __mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32gather_epi32 (__m128i __src, int const *__base, __m128i __index, + __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gathersiv4si ((__v4si)__src, + __base, + (__v4si)__index, + (__v4si)__mask, + __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32gather_epi32 (int const *__base, __m256i __index, const int __scale) +{ + __v8si __src = __extension__ (__v8si){ 0, 0, 0, 0, 0, 0, 0, 0 }; + __v8si __mask = __extension__ (__v8si){ ~0, ~0, ~0, ~0, ~0, ~0, ~0, ~0 }; + + return (__m256i) __builtin_ia32_gathersiv8si (__src, + __base, + (__v8si)__index, + __mask, + __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32gather_epi32 (__m256i __src, int const *__base, + __m256i __index, __m256i __mask, + const int __scale) +{ + return (__m256i) __builtin_ia32_gathersiv8si ((__v8si)__src, + __base, + (__v8si)__index, + (__v8si)__mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64gather_epi32 (int const *__base, __m128i __index, const int __scale) +{ + __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + + return (__m128i) __builtin_ia32_gatherdiv4si (__src, + __base, + (__v2di)__index, + __mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64gather_epi32 (__m128i __src, int const *__base, __m128i __index, + __m128i __mask, const int __scale) +{ + return (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)__src, + __base, + (__v2di)__index, + (__v4si)__mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64gather_epi32 (int const *__base, __m256i __index, const int __scale) +{ + __v4si __src = __extension__ (__v4si){ 0, 0, 0, 0 }; + __v4si __mask = __extension__ (__v4si){ ~0, ~0, ~0, ~0 }; + + return (__m128i) __builtin_ia32_gatherdiv4si256 (__src, + __base, + (__v4di)__index, + __mask, + __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64gather_epi32 (__m128i __src, int const *__base, + __m256i __index, __m128i __mask, + const int __scale) +{ + return (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si)__src, + __base, + (__v4di)__index, + (__v4si)__mask, + __scale); +} +#else /* __OPTIMIZE__ */ +#define _mm_i32gather_pd(BASE, INDEX, SCALE) \ + (__m128d) __builtin_ia32_gathersiv2df ((__v2df) _mm_setzero_pd (), \ + (double const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v2df) \ + _mm_cmpeq_pd (_mm_setzero_pd (),\ + _mm_setzero_pd ()),\ + (int) (SCALE)) + +#define _mm_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128d) __builtin_ia32_gathersiv2df ((__v2df)(__m128d) (SRC), \ + (double const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v2df)(__m128d) (MASK), \ + (int) (SCALE)) + +#define _mm256_i32gather_pd(BASE, INDEX, SCALE) \ + (__m256d) __builtin_ia32_gathersiv4df ((__v4df) _mm256_setzero_pd (), \ + (double const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4df) \ + _mm256_cmp_pd (_mm256_setzero_pd (),\ + _mm256_setzero_pd (),\ + _CMP_EQ_OQ), \ + (int) (SCALE)) + 
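For reference, the gather intrinsics above take a base pointer, a vector of signed indices, and a byte scale that must be a compile-time constant of 1, 2, 4, or 8; the macro forms in this #else branch provide the same operations when the file is built without __OPTIMIZE__. A minimal usage sketch, not part of the patched header, assuming a translation unit compiled with -mavx2; the helper names are illustrative only:

#include <immintrin.h>

/* result[i] = table[idx32[i]]: gather four ints through 32-bit indices.
   The last argument is the scale in bytes, here sizeof(int) == 4.  */
static inline __m128i
gather4_ints (const int *table, __m128i idx32)
{
  return _mm_i32gather_epi32 (table, idx32, 4);
}

/* Masked form: lanes whose mask element has the sign bit clear keep the
   corresponding value from 'src' and no load is performed for them.  */
static inline __m128i
gather4_ints_masked (__m128i src, const int *table,
                     __m128i idx32, __m128i mask)
{
  return _mm_mask_i32gather_epi32 (src, table, idx32, mask, 4);
}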
+#define _mm256_mask_i32gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256d) __builtin_ia32_gathersiv4df ((__v4df)(__m256d) (SRC), \ + (double const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4df)(__m256d) (MASK), \ + (int) (SCALE)) + +#define _mm_i64gather_pd(BASE, INDEX, SCALE) \ + (__m128d) __builtin_ia32_gatherdiv2df ((__v2df) _mm_setzero_pd (), \ + (double const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v2df) \ + _mm_cmpeq_pd (_mm_setzero_pd (),\ + _mm_setzero_pd ()),\ + (int) (SCALE)) + +#define _mm_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128d) __builtin_ia32_gatherdiv2df ((__v2df)(__m128d) (SRC), \ + (double const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v2df)(__m128d) (MASK), \ + (int) (SCALE)) + +#define _mm256_i64gather_pd(BASE, INDEX, SCALE) \ + (__m256d) __builtin_ia32_gatherdiv4df ((__v4df) _mm256_setzero_pd (), \ + (double const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4df) \ + _mm256_cmp_pd (_mm256_setzero_pd (),\ + _mm256_setzero_pd (),\ + _CMP_EQ_OQ), \ + (int) (SCALE)) + +#define _mm256_mask_i64gather_pd(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256d) __builtin_ia32_gatherdiv4df ((__v4df)(__m256d) (SRC), \ + (double const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4df)(__m256d) (MASK), \ + (int) (SCALE)) + +#define _mm_i32gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gathersiv4sf ((__v4sf) _mm_setzero_ps (), \ + (float const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4sf) \ + _mm_cmpeq_ps (_mm_setzero_ps (),\ + _mm_setzero_ps ()),\ + (int) (SCALE)) + +#define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128) (SRC), \ + (float const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4sf)(__m128) (MASK), \ + (int) (SCALE)) + +#define _mm256_i32gather_ps(BASE, INDEX, SCALE) \ + (__m256) __builtin_ia32_gathersiv8sf ((__v8sf) _mm256_setzero_ps (), \ + (float const *) (BASE), \ + (__v8si)(__m256i) (INDEX), \ + (__v8sf) \ + _mm256_cmp_ps (_mm256_setzero_ps (),\ + _mm256_setzero_ps (),\ + _CMP_EQ_OQ), \ + (int) (SCALE)) + +#define _mm256_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256) (SRC), \ + (float const *) (BASE), \ + (__v8si)(__m256i) (INDEX), \ + (__v8sf)(__m256) (MASK), \ + (int) (SCALE)) + +#define _mm_i64gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf) _mm_setzero_pd (), \ + (float const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v4sf) \ + _mm_cmpeq_ps (_mm_setzero_ps (),\ + _mm_setzero_ps ()),\ + (int) (SCALE)) + +#define _mm_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128) (SRC), \ + (float const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v4sf)(__m128) (MASK), \ + (int) (SCALE)) + +#define _mm256_i64gather_ps(BASE, INDEX, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf) _mm_setzero_ps (), \ + (float const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4sf) \ + _mm_cmpeq_ps (_mm_setzero_ps (),\ + _mm_setzero_ps ()),\ + (int) (SCALE)) + +#define _mm256_mask_i64gather_ps(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128) __builtin_ia32_gatherdiv4sf256 ((__v4sf)(__m128) (SRC), \ + (float const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4sf)(__m128) (MASK), \ + (int) (SCALE)) + +#define _mm_i32gather_epi64(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gathersiv2di ((__v2di) _mm_setzero_si128 (), \ + (long long const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + 
(__v2di)_mm_set1_epi64x (-1), \ + (int) (SCALE)) + +#define _mm_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gathersiv2di ((__v2di)(__m128i) (SRC), \ + (long long const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v2di)(__m128i) (MASK), \ + (int) (SCALE)) + +#define _mm256_i32gather_epi64(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gathersiv4di ((__v4di) _mm256_setzero_si256 (), \ + (long long const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4di)_mm256_set1_epi64x (-1), \ + (int) (SCALE)) + +#define _mm256_mask_i32gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gathersiv4di ((__v4di)(__m256i) (SRC), \ + (long long const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4di)(__m256i) (MASK), \ + (int) (SCALE)) + +#define _mm_i64gather_epi64(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv2di ((__v2di) _mm_setzero_si128 (), \ + (long long const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v2di)_mm_set1_epi64x (-1), \ + (int) (SCALE)) + +#define _mm_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv2di ((__v2di)(__m128i) (SRC), \ + (long long const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v2di)(__m128i) (MASK), \ + (int) (SCALE)) + +#define _mm256_i64gather_epi64(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv4di ((__v4di) _mm256_setzero_si256 (), \ + (long long const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4di)_mm256_set1_epi64x (-1), \ + (int) (SCALE)) + +#define _mm256_mask_i64gather_epi64(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv4di ((__v4di)(__m256i) (SRC), \ + (long long const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4di)(__m256i) (MASK), \ + (int) (SCALE)) + +#define _mm_i32gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gathersiv4si ((__v4si) _mm_setzero_si128 (), \ + (int const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4si)_mm_set1_epi32 (-1), \ + (int) (SCALE)) + +#define _mm_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gathersiv4si ((__v4si)(__m128i) (SRC), \ + (int const *) (BASE), \ + (__v4si)(__m128i) (INDEX), \ + (__v4si)(__m128i) (MASK), \ + (int) (SCALE)) + +#define _mm256_i32gather_epi32(BASE, INDEX, SCALE) \ + (__m256i) __builtin_ia32_gathersiv8si ((__v8si) _mm256_setzero_si256 (), \ + (int const *) (BASE), \ + (__v8si)(__m256i) (INDEX), \ + (__v8si)_mm256_set1_epi32 (-1), \ + (int) (SCALE)) + +#define _mm256_mask_i32gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m256i) __builtin_ia32_gathersiv8si ((__v8si)(__m256i) (SRC), \ + (int const *) (BASE), \ + (__v8si)(__m256i) (INDEX), \ + (__v8si)(__m256i) (MASK), \ + (int) (SCALE)) + +#define _mm_i64gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si ((__v4si) _mm_setzero_si128 (), \ + (int const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v4si)_mm_set1_epi32 (-1), \ + (int) (SCALE)) + +#define _mm_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si ((__v4si)(__m128i) (SRC), \ + (int const *) (BASE), \ + (__v2di)(__m128i) (INDEX), \ + (__v4si)(__m128i) (MASK), \ + (int) (SCALE)) + +#define _mm256_i64gather_epi32(BASE, INDEX, SCALE) \ + (__m128i) __builtin_ia32_gatherdiv4si256 ((__v4si) _mm_setzero_si128 (), \ + (int const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4si)_mm_set1_epi32(-1), \ + (int) (SCALE)) + +#define _mm256_mask_i64gather_epi32(SRC, BASE, INDEX, MASK, SCALE) \ + (__m128i) 
__builtin_ia32_gatherdiv4si256 ((__v4si)(__m128i) (SRC), \ + (int const *) (BASE), \ + (__v4di)(__m256i) (INDEX), \ + (__v4si)(__m128i) (MASK), \ + (int) (SCALE)) +#endif /* __OPTIMIZE__ */ + +#ifdef __DISABLE_AVX2__ +#undef __DISABLE_AVX2__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX2__ */ + +#endif /* _AVX2INTRIN_H_INCLUDED */ diff --git a/include-gcc/avx5124fmapsintrin.h b/include-gcc/avx5124fmapsintrin.h new file mode 100644 index 0000000..97dd77c --- /dev/null +++ b/include-gcc/avx5124fmapsintrin.h @@ -0,0 +1,216 @@ +/* Copyright (C) 2015-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _AVX5124FMAPSINTRIN_H_INCLUDED +#define _AVX5124FMAPSINTRIN_H_INCLUDED + +#ifndef __AVX5124FMAPS__ +#pragma GCC push_options +#pragma GCC target("avx5124fmaps") +#define __DISABLE_AVX5124FMAPS__ +#endif /* __AVX5124FMAPS__ */ + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_4fmadd_ps (__m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fmaddps ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_4fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_4fmadd_ps (__mmask16 __U, + __m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_4fmadd_ss (__m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fmaddss ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_4fmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return 
(__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F, + (__v4sf) __A, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_4fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fmaddss_mask ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_4fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fnmaddps ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_4fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_4fnmadd_ps (__mmask16 __U, + __m512 __A, __m512 __B, __m512 __C, + __m512 __D, __m512 __E, __m128 *__F) +{ + return (__m512) __builtin_ia32_4fnmaddps_mask ((__v16sf) __B, + (__v16sf) __C, + (__v16sf) __D, + (__v16sf) __E, + (__v16sf) __A, + (const __v4sf *) __F, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_4fnmadd_ss (__m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fnmaddss ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_4fnmadd_ss (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F, + (__v4sf) __A, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_4fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C, + __m128 __D, __m128 __E, __m128 *__F) +{ + return (__m128) __builtin_ia32_4fnmaddss_mask ((__v4sf) __B, + (__v4sf) __C, + (__v4sf) __D, + (__v4sf) __E, + (__v4sf) __A, + (const __v4sf *) __F, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +#ifdef __DISABLE_AVX5124FMAPS__ +#undef __DISABLE_AVX5124FMAPS__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX5124FMAPS__ */ + +#endif /* _AVX5124FMAPSINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx5124vnniwintrin.h b/include-gcc/avx5124vnniwintrin.h new file mode 100644 index 0000000..fd12958 --- /dev/null +++ b/include-gcc/avx5124vnniwintrin.h @@ -0,0 +1,132 @@ +/* Copyright (C) 2015-2023 Free Software Foundation, Inc. + + This file is part of GCC. 
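The AVX512_4FMAPS intrinsics in avx5124fmapsintrin.h above differ from the usual FMA pattern in that the last operand is a pointer to a 128-bit block of four scalars read from memory. A calling-convention sketch only, not part of the header; the helper name is illustrative, the build is assumed to use -mavx5124fmaps, and the exact per-iteration semantics are as documented in the Intel SDM:

#include <immintrin.h>

/* acc is the accumulator; b..e are each multiplied by one of the four
   successive single-precision scalars in *scalars4 (broadcast) and the
   products are accumulated into acc.  */
static inline __m512
four_fma_step (__m512 acc, __m512 b, __m512 c,
               __m512 d, __m512 e, const float *scalars4)
{
  __m128 f = _mm_loadu_ps (scalars4);   /* the 128-bit memory operand */
  return _mm512_4fmadd_ps (acc, b, c, d, e, &f);
}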
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _AVX5124VNNIWINTRIN_H_INCLUDED +#define _AVX5124VNNIWINTRIN_H_INCLUDED + +#ifndef __AVX5124VNNIW__ +#pragma GCC push_options +#pragma GCC target("avx5124vnniw") +#define __DISABLE_AVX5124VNNIW__ +#endif /* __AVX5124VNNIW__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_4dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C, + __m512i __D, __m512i __E, __m128i *__F) +{ + return (__m512i) __builtin_ia32_vp4dpwssd ((__v16si) __B, + (__v16si) __C, + (__v16si) __D, + (__v16si) __E, + (__v16si) __A, + (const __v4si *) __F); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_4dpwssd_epi32 (__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) +{ + return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B, + (__v16si) __C, + (__v16si) __D, + (__v16si) __E, + (__v16si) __A, + (const __v4si *) __F, + (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_4dpwssd_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) +{ + return (__m512i) __builtin_ia32_vp4dpwssd_mask ((__v16si) __B, + (__v16si) __C, + (__v16si) __D, + (__v16si) __E, + (__v16si) __A, + (const __v4si *) __F, + (__v16si) _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_4dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C, + __m512i __D, __m512i __E, __m128i *__F) +{ + return (__m512i) __builtin_ia32_vp4dpwssds ((__v16si) __B, + (__v16si) __C, + (__v16si) __D, + (__v16si) __E, + (__v16si) __A, + (const __v4si *) __F); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_4dpwssds_epi32 (__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) +{ + return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B, + (__v16si) __C, + (__v16si) __D, + (__v16si) __E, + (__v16si) __A, + (const __v4si *) __F, + (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_4dpwssds_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, __m512i __D, __m512i __E, + __m128i *__F) +{ + return (__m512i) __builtin_ia32_vp4dpwssds_mask ((__v16si) __B, + (__v16si) __C, + (__v16si) __D, + (__v16si) __E, + 
(__v16si) __A, + (const __v4si *) __F, + (__v16si) _mm512_setzero_ps (), + (__mmask16) __U); +} + +#ifdef __DISABLE_AVX5124VNNIW__ +#undef __DISABLE_AVX5124VNNIW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX5124VNNIW__ */ + +#endif /* _AVX5124VNNIWINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512bf16intrin.h b/include-gcc/avx512bf16intrin.h new file mode 100644 index 0000000..107f4a4 --- /dev/null +++ b/include-gcc/avx512bf16intrin.h @@ -0,0 +1,152 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512BF16INTRIN_H_INCLUDED +#define _AVX512BF16INTRIN_H_INCLUDED + +#ifndef __AVX512BF16__ +#pragma GCC push_options +#pragma GCC target("avx512bf16") +#define __DISABLE_AVX512BF16__ +#endif /* __AVX512BF16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef __bf16 __v32bf __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef __bf16 __m512bh __attribute__ ((__vector_size__ (64), __may_alias__)); + +/* Convert One BF16 Data to One Single Float Data. 
*/ +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsbh_ss (__bf16 __A) +{ + return __builtin_ia32_cvtbf2sf (__A); +} + +/* vcvtne2ps2bf16 */ + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtne2ps_pbh (__m512 __A, __m512 __B) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf(__A, __B); +} + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtne2ps_pbh (__m512bh __A, __mmask32 __B, __m512 __C, __m512 __D) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_mask(__C, __D, __A, __B); +} + +extern __inline __m512bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtne2ps_pbh (__mmask32 __A, __m512 __B, __m512 __C) +{ + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_v32bf_maskz(__B, __C, __A); +} + +/* vcvtneps2bf16 */ + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtneps_pbh (__m512 __A) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf(__A); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtneps_pbh (__m256bh __A, __mmask16 __B, __m512 __C) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_mask(__C, __A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtneps_pbh (__mmask16 __A, __m512 __B) +{ + return (__m256bh)__builtin_ia32_cvtneps2bf16_v16sf_maskz(__B, __A); +} + +/* vdpbf16ps */ + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpbf16_ps (__m512 __A, __m512bh __B, __m512bh __C) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf(__A, __B, __C); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpbf16_ps (__m512 __A, __mmask16 __B, __m512bh __C, __m512bh __D) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf_mask(__A, __C, __D, __B); +} + +extern __inline __m512 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpbf16_ps (__mmask16 __A, __m512 __B, __m512bh __C, __m512bh __D) +{ + return (__m512)__builtin_ia32_dpbf16ps_v16sf_maskz(__B, __C, __D, __A); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpbh_ps (__m256bh __A) +{ + return (__m512)_mm512_castsi512_ps ((__m512i)_mm512_slli_epi32 ( + (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16)); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpbh_ps (__mmask16 __U, __m256bh __A) +{ + return (__m512)_mm512_castsi512_ps ((__m512i) _mm512_slli_epi32 ( + (__m512i)_mm512_maskz_cvtepi16_epi32 ( + (__mmask16)__U, (__m256i)__A), 16)); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpbh_ps (__m512 __S, __mmask16 __U, __m256bh __A) +{ + return (__m512)_mm512_castsi512_ps ((__m512i)(_mm512_mask_slli_epi32 ( + (__m512i)__S, (__mmask16)__U, + (__m512i)_mm512_cvtepi16_epi32 ((__m256i)__A), 16))); +} + +#ifdef __DISABLE_AVX512BF16__ +#undef __DISABLE_AVX512BF16__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BF16__ */ + +#endif /* _AVX512BF16INTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512bf16vlintrin.h b/include-gcc/avx512bf16vlintrin.h new file mode 100644 index 0000000..6e8a6a0 --- /dev/null +++ 
b/include-gcc/avx512bf16vlintrin.h @@ -0,0 +1,238 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512BF16VLINTRIN_H_INCLUDED +#define _AVX512BF16VLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512BF16__) +#pragma GCC push_options +#pragma GCC target("avx512bf16,avx512vl") +#define __DISABLE_AVX512BF16VL__ +#endif /* __AVX512BF16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef __bf16 __v16bf __attribute__ ((__vector_size__ (32))); +typedef __bf16 __v8bf __attribute__ ((__vector_size__ (16))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef __bf16 __m256bh __attribute__ ((__vector_size__ (32), __may_alias__)); +typedef __bf16 __m128bh __attribute__ ((__vector_size__ (16), __may_alias__)); + +typedef __bf16 __bfloat16; + +#define _mm256_cvtneps_pbh(A) \ + (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (A) +#define _mm_cvtneps_pbh(A) \ + (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (A) + +/* vcvtne2ps2bf16 */ + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtne2ps_pbh (__m256 __A, __m256 __B) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf(__A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtne2ps_pbh (__m256bh __A, __mmask16 __B, __m256 __C, __m256 __D) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_mask(__C, __D, __A, __B); +} + +extern __inline __m256bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtne2ps_pbh (__mmask16 __A, __m256 __B, __m256 __C) +{ + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_v16bf_maskz(__B, __C, __A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtne2ps_pbh (__m128 __A, __m128 __B) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf(__A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtne2ps_pbh (__m128bh __A, __mmask8 __B, __m128 __C, __m128 __D) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_mask(__C, __D, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtne2ps_pbh (__mmask8 __A, __m128 __B, __m128 __C) +{ + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_v8bf_maskz(__B, __C, __A); +} + +/* vcvtneps2bf16 */ + +extern __inline __m128bh 
+__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m256 __C) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_mask(__C, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtneps_pbh (__mmask8 __A, __m256 __B) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v8sf_maskz(__B, __A); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtneps_pbh (__m128bh __A, __mmask8 __B, __m128 __C) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_mask(__C, __A, __B); +} + +extern __inline __m128bh +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtneps_pbh (__mmask8 __A, __m128 __B) +{ + return (__m128bh)__builtin_ia32_cvtneps2bf16_v4sf_maskz(__B, __A); +} + +/* vdpbf16ps */ + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbf16_ps (__m256 __A, __m256bh __B, __m256bh __C) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf(__A, __B, __C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpbf16_ps (__m256 __A, __mmask8 __B, __m256bh __C, __m256bh __D) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf_mask(__A, __C, __D, __B); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpbf16_ps (__mmask8 __A, __m256 __B, __m256bh __C, __m256bh __D) +{ + return (__m256)__builtin_ia32_dpbf16ps_v8sf_maskz(__B, __C, __D, __A); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbf16_ps (__m128 __A, __m128bh __B, __m128bh __C) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf(__A, __B, __C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpbf16_ps (__m128 __A, __mmask8 __B, __m128bh __C, __m128bh __D) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf_mask(__A, __C, __D, __B); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpbf16_ps (__mmask8 __A, __m128 __B, __m128bh __C, __m128bh __D) +{ + return (__m128)__builtin_ia32_dpbf16ps_v4sf_maskz(__B, __C, __D, __A); +} + +extern __inline __bf16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtness_sbh (float __A) +{ + __v4sf __V = {__A, 0, 0, 0}; + __v8bf __R = __builtin_ia32_cvtneps2bf16_v4sf_mask ((__v4sf)__V, + (__v8bf)_mm_undefined_si128 (), (__mmask8)-1); + return __R[0]; +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpbh_ps (__m128bh __A) +{ + return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 ( + (__m128i)_mm_cvtepi16_epi32 ((__m128i)__A), 16)); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpbh_ps (__m128bh __A) +{ + return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 ( + (__m256i)_mm256_cvtepi16_epi32 ((__m128i)__A), 16)); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpbh_ps (__mmask8 __U, __m128bh __A) +{ + return (__m128)_mm_castsi128_ps ((__m128i)_mm_slli_epi32 ( + (__m128i)_mm_maskz_cvtepi16_epi32 ( + (__mmask8)__U, (__m128i)__A), 16)); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpbh_ps 
(__mmask8 __U, __m128bh __A) +{ + return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_slli_epi32 ( + (__m256i)_mm256_maskz_cvtepi16_epi32 ( + (__mmask8)__U, (__m128i)__A), 16)); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpbh_ps (__m128 __S, __mmask8 __U, __m128bh __A) +{ + return (__m128)_mm_castsi128_ps ((__m128i)_mm_mask_slli_epi32 ( + (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32 ( + (__m128i)__A), 16)); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpbh_ps (__m256 __S, __mmask8 __U, __m128bh __A) +{ + return (__m256)_mm256_castsi256_ps ((__m256i)_mm256_mask_slli_epi32 ( + (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32 ( + (__m128i)__A), 16)); +} + +#ifdef __DISABLE_AVX512BF16VL__ +#undef __DISABLE_AVX512BF16VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BF16VL__ */ + +#endif /* _AVX512BF16VLINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512bitalgintrin.h b/include-gcc/avx512bitalgintrin.h new file mode 100644 index 0000000..aa6d652 --- /dev/null +++ b/include-gcc/avx512bitalgintrin.h @@ -0,0 +1,283 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
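Taken together, the avx512bf16intrin.h and avx512bf16vlintrin.h additions above provide the vector conversion and dot-product forms as well as the scalar bf16 helpers. A short usage sketch, not part of the headers, assuming -mavx512bf16 -mavx512vl; helper names are illustrative:

#include <immintrin.h>

/* One step of a bf16 dot product: pack 16+16 floats from x0/x1 and from
   y0/y1 into 32 bf16 lanes each, then accumulate the pairwise products
   into the fp32 accumulator.  */
static inline __m512
bf16_dot_step (__m512 acc, __m512 x0, __m512 x1, __m512 y0, __m512 y1)
{
  __m512bh x = _mm512_cvtne2ps_pbh (x0, x1);
  __m512bh y = _mm512_cvtne2ps_pbh (y0, y1);
  return _mm512_dpbf16_ps (acc, x, y);
}

/* Scalar round trip through bf16 (round-to-nearest-even; bf16 keeps only
   an 8-bit significand, so low mantissa bits are lost).  */
static inline float
through_bf16 (float f)
{
  return _mm_cvtsbh_ss (_mm_cvtness_sbh (f));
}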
+#endif + +#ifndef _AVX512BITALGINTRIN_H_INCLUDED +#define _AVX512BITALGINTRIN_H_INCLUDED + +#ifndef __AVX512BITALG__ +#pragma GCC push_options +#pragma GCC target("avx512bitalg") +#define __DISABLE_AVX512BITALG__ +#endif /* __AVX512BITALG__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_popcnt_epi8 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountb_v64qi ((__v64qi) __A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_popcnt_epi16 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountw_v32hi ((__v32hi) __A); +} + +#ifdef __DISABLE_AVX512BITALG__ +#undef __DISABLE_AVX512BITALG__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BITALG__ */ + +#if !defined(__AVX512BITALG__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512bitalg,avx512bw") +#define __DISABLE_AVX512BITALGBW__ +#endif /* __AVX512VLBW__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_popcnt_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_popcnt_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountb_v64qi_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_popcnt_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_popcnt_epi16 (__mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountw_v32hi_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __mmask64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_bitshuffle_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_bitshuffle_epi64_mask (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) __M); +} + +#ifdef __DISABLE_AVX512BITALGBW__ +#undef __DISABLE_AVX512BITALGBW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BITALGBW__ */ + +#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512bitalg,avx512vl,avx512bw") +#define __DISABLE_AVX512BITALGVLBW__ +#endif /* __AVX512VLBW__ */ + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_popcnt_epi8 (__m256i __W, __mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_popcnt_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountb_v32qi_mask ((__v32qi) __A, + (__v32qi) + 
_mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __mmask32 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bitshuffle_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_bitshuffle_epi64_mask (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) __M); +} + +#ifdef __DISABLE_AVX512BITALGVLBW__ +#undef __DISABLE_AVX512BITALGVLBW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BITALGVLBW__ */ + + +#if !defined(__AVX512BITALG__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("avx512bitalg,avx512vl") +#define __DISABLE_AVX512BITALGVL__ +#endif /* __AVX512VLBW__ */ + +extern __inline __mmask16 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bitshuffle_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_bitshuffle_epi64_mask (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_popcnt_epi8 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountb_v32qi ((__v32qi) __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_popcnt_epi16 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountw_v16hi ((__v16hi) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountb_v16qi ((__v16qi) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountw_v8hi ((__v8hi) __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_popcnt_epi16 (__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_popcnt_epi16 (__mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountw_v16hi_mask ((__v16hi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_popcnt_epi8 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_popcnt_epi8 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountb_v16qi_mask ((__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_popcnt_epi16 (__m128i 
__W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_popcnt_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountw_v8hi_mask ((__v8hi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} +#ifdef __DISABLE_AVX512BITALGVL__ +#undef __DISABLE_AVX512BITALGVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BITALGBW__ */ + +#endif /* _AVX512BITALGINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512bwintrin.h b/include-gcc/avx512bwintrin.h new file mode 100644 index 0000000..89790f7 --- /dev/null +++ b/include-gcc/avx512bwintrin.h @@ -0,0 +1,3333 @@ +/* Copyright (C) 2014-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512BWINTRIN_H_INCLUDED +#define _AVX512BWINTRIN_H_INCLUDED + +#ifndef __AVX512BW__ +#pragma GCC push_options +#pragma GCC target("avx512bw") +#define __DISABLE_AVX512BW__ +#endif /* __AVX512BW__ */ + +/* Internal data types for implementing the intrinsics. 
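A brief usage sketch for the VPOPCNT/BITALG intrinsics of avx512bitalgintrin.h above, not part of the header; it assumes -mavx512bitalg -mavx512bw (plus -mavx512vl for the 128/256-bit forms) and the helper names are illustrative:

#include <immintrin.h>

/* One population count per byte of a 512-bit vector.  */
static inline __m512i
popcnt_per_byte (__m512i v)
{
  return _mm512_popcnt_epi8 (v);
}

/* Masked per-word popcount: lanes with a zero mask bit keep the value
   from 'src'; the maskz_ variant would zero them instead.  */
static inline __m512i
popcnt_per_word_masked (__m512i src, __mmask32 m, __m512i v)
{
  return _mm512_mask_popcnt_epi16 (src, m, v);
}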
*/ +typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef short __v32hi_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); +typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef char __v64qi_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); + +typedef unsigned long long __mmask64; + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_ktestcsi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_ktestcdi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_ktestzsi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_ktestzdi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_ktestcsi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_ktestcdi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask32_u8 (__mmask32 __A, __mmask32 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestcsi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask64_u8 (__mmask64 __A, __mmask64 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestcdi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_kortestzsi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_kortestzdi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask32_u8 (__mmask32 __A, __mmask32 __B) +{ + return (unsigned char) __builtin_ia32_kortestcsi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask64_u8 (__mmask64 __A, __mmask64 __B) +{ + return (unsigned char) __builtin_ia32_kortestcdi (__A, __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kaddsi 
((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kadddi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask32_u32 (__mmask32 __A) +{ + return (unsigned int) __builtin_ia32_kmovd ((__mmask32) __A); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask64_u64 (__mmask64 __A) +{ + return (unsigned long long) __builtin_ia32_kmovq ((__mmask64) __A); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu32_mask32 (unsigned int __A) +{ + return (__mmask32) __builtin_ia32_kmovd ((__mmask32) __A); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu64_mask64 (unsigned long long __A) +{ + return (__mmask64) __builtin_ia32_kmovq ((__mmask64) __A); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask32 (__mmask32 *__A) +{ + return (__mmask32) __builtin_ia32_kmovd (*__A); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask64 (__mmask64 *__A) +{ + return (__mmask64) __builtin_ia32_kmovq (*(__mmask64 *) __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask32 (__mmask32 *__A, __mmask32 __B) +{ + *(__mmask32 *) __A = __builtin_ia32_kmovd (__B); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask64 (__mmask64 *__A, __mmask64 __B) +{ + *(__mmask64 *) __A = __builtin_ia32_kmovq (__B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask32 (__mmask32 __A) +{ + return (__mmask32) __builtin_ia32_knotsi ((__mmask32) __A); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask64 (__mmask64 __A) +{ + return (__mmask64) __builtin_ia32_knotdi ((__mmask64) __A); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_korsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kordi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxnorsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kxnordi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kxorsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask64 (__mmask64 __A, __mmask64 __B) +{ + 
return (__mmask64) __builtin_ia32_kxordi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kanddi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask32 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kandnsi ((__mmask32) __A, (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask64 (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kandndi ((__mmask64) __A, (__mmask64) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquhi512_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi16 (void const *__P) +{ + return (__m512i) (*(__v32hi_u *) __P); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const short *) __P, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi16 (void *__P, __m512i __A) +{ + *(__v32hi_u *) __P = (__v32hi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) +{ + __builtin_ia32_storedquhi512_mask ((short *) __P, + (__v32hi) __A, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdquqi512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackw (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} + +extern __inline 
__mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackw_mask32 (__mmask16 __A, __mmask16 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackd (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackd_mask64 (__mmask32 __A, __mmask32 __B) +{ + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi8 (void const *__P) +{ + return (__m512i) (*(__v64qi_u *) __P); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const char *) __P, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi8 (void *__P, __m512i __A) +{ + *(__v64qi_u *) __P = (__v64qi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) +{ + __builtin_ia32_storedquqi512_mask ((char *) __P, + (__v64qi) __A, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sad_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A, + (__v64qi) __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_epi8 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) _mm256_undefined_si256(), + (__mmask32) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi16_epi8 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)_mm256_undefined_si256(), + (__mmask32) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 
__M, __m512i __A) +{ + __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)__O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi16_epi8 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi)_mm256_undefined_si256(), + (__mmask32) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) __O, + __M); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastb_epi8 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi)_mm512_undefined_epi32(), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastb512_mask ((__v16qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) +{ + return (__m512i) __builtin_ia32_pbroadcastb512_gpr_mask (__A, + (__v64qi) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastb512_gpr_mask (__A, + (__v64qi) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastw_epi16 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi)_mm512_undefined_epi32(), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi) __O, + __M); +} + 
+extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_mask ((__v8hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) +{ + return (__m512i) __builtin_ia32_pbroadcastw512_gpr_mask (__A, + (__v32hi) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastw512_gpr_mask (__A, + (__v32hi) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhrs_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhrs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhrs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhrsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhi_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mulhi_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mulhi_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulhuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullo_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v32hu) __A * (__v32hu) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mullo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmullw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi8_epi16 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi8_epi16 (__mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu8_epi16 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu8_epi16 (__m512i __W, __mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu8_epi16 (__mmask32 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbw512_mask ((__v32qi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) __W, + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi16 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) 
__builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I + /* idx */ , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi16 (__m512i __A, __mmask32 __U, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varhi512_mask ((__v32hi) __I + /* idx */ , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi16 (__m512i __A, __m512i __I, + __mmask32 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, + (__v32hi) __I + /* idx */ , + (__v32hi) __B, + (__mmask32) + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi16 (__mmask32 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varhi512_maskz ((__v32hi) __I + /* idx */ , + (__v32hi) __A, + (__v32hi) __B, + (__mmask32) + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_avg_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v64qu) __A + (__v64qu) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v64qu) __A - (__v64qu) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubb512_mask ((__v64qi) __A, + (__v64qi) __B, 
+ (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_avg_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pavgw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsb512_mask 
((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v32hu) __A - (__v32hu) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_subs_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) 
+ _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v32hu) __A + (__v32hu) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_adds_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddusw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi16 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srl_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + 
+extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_packs_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi16 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) +{ + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maddubs_epi16 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_maddubs_epi16 (__m512i __W, __mmask32 __U, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_maddubs_epi16 (__mmask32 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmaddubsw512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_madd_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_madd_epi16 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_madd_epi16 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaddwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, + 
(__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklbw512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklwd512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 0, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi8_mask (__m512i __A, __m512i __B) +{ + return 
(__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 0, + __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_pcmpeqb512_mask ((__v64qi) __A, + (__v64qi) __B, + __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 0, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 0, + __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqw512_mask ((__v32hi) __A, + (__v32hi) __B, + __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 6, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __A, + (__v64qi) __B, 6, + __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_pcmpgtb512_mask ((__v64qi) __A, + (__v64qi) __B, + __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 6, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __A, + (__v32hi) __B, 6, + __U); +} + +extern 
__inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_pcmpgtw512_mask ((__v32hi) __A, + (__v32hi) __B, + __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movepi8_mask (__m512i __A) +{ + return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movepi16_mask (__m512i __A) +{ + return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movm_epi8 (__mmask64 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2b512 (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movm_epi16 (__mmask32 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2w512 (__A); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestmb512 ((__v64qi) __A, + (__v64qi) __B, __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestmw512 ((__v32hi) __A, + (__v32hi) __B, __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi8_mask (__m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_ptestnmb512 ((__v64qi) __A, + (__v64qi) __B, __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi16_mask (__m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, + (__v32hi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmw512 ((__v32hi) __A, + (__v32hi) __B, __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_epi8 (__m512i __W, __mmask64 
__U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pshufb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminub512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsb512_mask ((__v64qi) __A, + (__v64qi) __B, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuw512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi16 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi16 (__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraw512_mask ((__v32hi) __A, + (__v8hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psrav32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srlv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srlv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psrlv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sllv_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline 
__m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sllv_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psllv32hi_mask ((__v32hi) __A, + (__v32hi) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_packs_epi16 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_packs_epi16 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packsswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_packus_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_packus_epi16 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_packus_epi16 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packuswb512_mask ((__v32hi) __A, + (__v32hi) __B, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi8 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsb512_mask ((__v64qi) __A, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi16 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi16 
(__mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsw512_mask ((__v32hi) __A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epu8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epu16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epi8_mask (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) __M); +} + +extern __inline 
__mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epi16_mask (__mmask32 __M, __m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) __M); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epu8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epu16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epu16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epu16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epu16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epi8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 4, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_cmplt_epi8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 1, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epi8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 5, + (__mmask64) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epi8_mask (__m512i __X, __m512i __Y) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, 2, + (__mmask64) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epi16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 4, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epi16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 1, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epi16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 5, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epi16_mask (__m512i __X, __m512i __Y) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, 2, + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_packs_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_packs_epi32 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_packs_epi32 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_packssdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) __W, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_packus_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_packus_epi32 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_packus_epi32 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_packusdw512_mask ((__v16si) __A, + (__v16si) __B, + (__v32hi) __W, + __M); +} + +#ifdef __OPTIMIZE__ +extern __inline __mmask32 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftli_mask32 (__mmask32 __A, unsigned int __B) +{ + return (__mmask32) __builtin_ia32_kshiftlisi ((__mmask32) __A, + (__mmask8) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftli_mask64 (__mmask64 __A, unsigned int __B) +{ + return (__mmask64) __builtin_ia32_kshiftlidi ((__mmask64) __A, + (__mmask8) __B); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftri_mask32 (__mmask32 __A, unsigned int __B) +{ + return (__mmask32) __builtin_ia32_kshiftrisi ((__mmask32) __A, + (__mmask8) __B); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftri_mask64 (__mmask64 __A, unsigned int __B) +{ + return (__mmask64) __builtin_ia32_kshiftridi ((__mmask64) __A, + (__mmask8) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_alignr_epi8 (__m512i __A, __m512i __B, const int __N) +{ + return (__m512i) __builtin_ia32_palignr512 ((__v8di) __A, + (__v8di) __B, __N * 8); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_alignr_epi8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B, const int __N) +{ + return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A, + (__v8di) __B, + __N * 8, + (__v8di) __W, + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_alignr_epi8 (__mmask64 __U, __m512i __A, __m512i __B, + const int __N) +{ + return (__m512i) __builtin_ia32_palignr512_mask ((__v8di) __A, + (__v8di) __B, + __N * 8, + (__v8di) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dbsad_epu8 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dbsad_epu8 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dbsad_epu8 (__mmask32 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi) __A, + (__v64qi) __B, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi16 (__mmask32 __U, __m512i 
__A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi16 (__m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __B) +{ + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi16 (__mmask32 __U, __m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_psllwi512_mask ((__v32hi) __A, __B, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shufflehi_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shufflehi_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shufflehi_epi16 (__mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shufflelo_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shufflelo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shufflelo_epi16 (__mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi) __A, + __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi16 (__m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi16 (__m512i __W, __mmask32 __U, __m512i __A, + const int __imm) +{ + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi16 (__mmask32 
__U, __m512i __A, const int __imm) +{ + return (__m512i) __builtin_ia32_psrawi512_mask ((__v32hi) __A, __imm, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) __A, + (__v32hi) __W, + (__mmask32) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) __A, + (__v64qi) __W, + (__mmask64) __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi16_mask (__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi16_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi8_mask (__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi8_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu16_mask (__mmask32 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu16_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi) __X, + (__v32hi) __Y, __P, + (__mmask32) -1); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu8_mask (__mmask64 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) __U); +} + +extern __inline __mmask64 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu8_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi) __X, + (__v64qi) __Y, __P, + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_bslli_epi128 (__m512i __A, const int __N) +{ + return (__m512i) __builtin_ia32_pslldq512 (__A, __N * 8); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_bsrli_epi128 (__m512i __A, const int __N) +{ + return (__m512i) __builtin_ia32_psrldq512 (__A, __N * 8); +} + +#else +#define _kshiftli_mask32(X, Y) \ + ((__mmask32) __builtin_ia32_kshiftlisi ((__mmask32)(X), (__mmask8)(Y))) + 
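/* Editorial note, not part of the upstream GCC header added by this
   patch: the immediate-operand intrinsics above are provided as
   always-inline functions when __OPTIMIZE__ is defined and as the
   macros in this #else branch otherwise, so the count always reaches
   the builtin as a compile-time constant.  A minimal usage sketch
   follows; the helper name shift_mask_by_four is illustrative and
   assumes the header is reached through <immintrin.h> on an
   AVX-512BW-capable target.  */
__attribute__ ((target ("avx512bw")))
static __mmask32 shift_mask_by_four (__mmask32 m)
{
  /* The shift count must be a literal; in a non-optimized build
     _kshiftli_mask32 is the macro defined just above and forwards the
     count to the builtin as an immediate.  */
  return _kshiftli_mask32 (m, 4);
}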
+#define _kshiftli_mask64(X, Y) \ + ((__mmask64) __builtin_ia32_kshiftlidi ((__mmask64)(X), (__mmask8)(Y))) + +#define _kshiftri_mask32(X, Y) \ + ((__mmask32) __builtin_ia32_kshiftrisi ((__mmask32)(X), (__mmask8)(Y))) + +#define _kshiftri_mask64(X, Y) \ + ((__mmask64) __builtin_ia32_kshiftridi ((__mmask64)(X), (__mmask8)(Y))) + +#define _mm512_alignr_epi8(X, Y, N) \ + ((__m512i) __builtin_ia32_palignr512 ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), \ + (int)((N) * 8))) + +#define _mm512_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)((N) * 8), \ + (__v8di)(__m512i)(W), (__mmask64)(U))) + +#define _mm512_maskz_alignr_epi8(U, X, Y, N) \ + ((__m512i) __builtin_ia32_palignr512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)((N) * 8), \ + (__v8di)(__m512i) \ + _mm512_setzero_si512 (), \ + (__mmask64)(U))) + +#define _mm512_dbsad_epu8(X, Y, C) \ + ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), \ + (__v64qi)(__m512i) (Y), (int) (C), \ + (__v32hi)(__m512i) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1)) + +#define _mm512_mask_dbsad_epu8(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), \ + (__v64qi)(__m512i) (Y), (int) (C), \ + (__v32hi)(__m512i)(W), \ + (__mmask32)(U))) + +#define _mm512_maskz_dbsad_epu8(U, X, Y, C) \ + ((__m512i) __builtin_ia32_dbpsadbw512_mask ((__v64qi)(__m512i) (X), \ + (__v64qi)(__m512i) (Y), (int) (C), \ + (__v32hi)(__m512i) \ + _mm512_setzero_si512 (), \ + (__mmask32)(U))) + +#define _mm512_srli_epi16(A, B) \ + ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \ + (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1)) + +#define _mm512_mask_srli_epi16(W, U, A, B) \ + ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \ + (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define _mm512_maskz_srli_epi16(U, A, B) \ + ((__m512i) __builtin_ia32_psrlwi512_mask ((__v32hi)(__m512i)(A), \ + (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U))) + +#define _mm512_slli_epi16(X, C) \ + ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\ + (__v32hi)(__m512i)_mm512_setzero_si512 (), \ + (__mmask32)-1)) + +#define _mm512_mask_slli_epi16(W, U, X, C) \ + ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\ + (__v32hi)(__m512i)(W),\ + (__mmask32)(U))) + +#define _mm512_maskz_slli_epi16(U, X, C) \ + ((__m512i)__builtin_ia32_psllwi512_mask ((__v32hi)(__m512i)(X), (int)(C),\ + (__v32hi)(__m512i)_mm512_setzero_si512 (), \ + (__mmask32)(U))) + +#define _mm512_shufflehi_epi16(A, B) \ + ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1)) + +#define _mm512_mask_shufflehi_epi16(W, U, A, B) \ + ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i)(W), \ + (__mmask32)(U))) + +#define _mm512_maskz_shufflehi_epi16(U, A, B) \ + ((__m512i) __builtin_ia32_pshufhw512_mask ((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i) \ + _mm512_setzero_si512 (), \ + (__mmask32)(U))) + +#define _mm512_shufflelo_epi16(A, B) \ + ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1)) + +#define _mm512_mask_shufflelo_epi16(W, U, A, B) \ + ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i)(W), \ + 
(__mmask32)(U))) + +#define _mm512_maskz_shufflelo_epi16(U, A, B) \ + ((__m512i) __builtin_ia32_pshuflw512_mask ((__v32hi)(__m512i)(A), (int)(B), \ + (__v32hi)(__m512i) \ + _mm512_setzero_si512 (), \ + (__mmask32)(U))) + +#define _mm512_srai_epi16(A, B) \ + ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \ + (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)-1)) + +#define _mm512_mask_srai_epi16(W, U, A, B) \ + ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \ + (int)(B), (__v32hi)(__m512i)(W), (__mmask32)(U))) + +#define _mm512_maskz_srai_epi16(U, A, B) \ + ((__m512i) __builtin_ia32_psrawi512_mask ((__v32hi)(__m512i)(A), \ + (int)(B), (__v32hi)_mm512_setzero_si512 (), (__mmask32)(U))) + +#define _mm512_mask_blend_epi16(__U, __A, __W) \ + ((__m512i) __builtin_ia32_blendmw_512_mask ((__v32hi) (__A), \ + (__v32hi) (__W), \ + (__mmask32) (__U))) + +#define _mm512_mask_blend_epi8(__U, __A, __W) \ + ((__m512i) __builtin_ia32_blendmb_512_mask ((__v64qi) (__A), \ + (__v64qi) (__W), \ + (__mmask64) (__U))) + +#define _mm512_cmp_epi16_mask(X, Y, P) \ + ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), \ + (__v32hi)(__m512i)(Y), (int)(P),\ + (__mmask32)(-1))) + +#define _mm512_cmp_epi8_mask(X, Y, P) \ + ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), (int)(P),\ + (__mmask64)(-1))) + +#define _mm512_cmp_epu16_mask(X, Y, P) \ + ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), \ + (__v32hi)(__m512i)(Y), (int)(P),\ + (__mmask32)(-1))) + +#define _mm512_cmp_epu8_mask(X, Y, P) \ + ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), (int)(P),\ + (__mmask64)(-1))) + +#define _mm512_mask_cmp_epi16_mask(M, X, Y, P) \ + ((__mmask32) __builtin_ia32_cmpw512_mask ((__v32hi)(__m512i)(X), \ + (__v32hi)(__m512i)(Y), (int)(P),\ + (__mmask32)(M))) + +#define _mm512_mask_cmp_epi8_mask(M, X, Y, P) \ + ((__mmask64) __builtin_ia32_cmpb512_mask ((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), (int)(P),\ + (__mmask64)(M))) + +#define _mm512_mask_cmp_epu16_mask(M, X, Y, P) \ + ((__mmask32) __builtin_ia32_ucmpw512_mask ((__v32hi)(__m512i)(X), \ + (__v32hi)(__m512i)(Y), (int)(P),\ + (__mmask32)(M))) + +#define _mm512_mask_cmp_epu8_mask(M, X, Y, P) \ + ((__mmask64) __builtin_ia32_ucmpb512_mask ((__v64qi)(__m512i)(X), \ + (__v64qi)(__m512i)(Y), (int)(P),\ + (__mmask64)(M))) + +#define _mm512_bslli_epi128(A, N) \ + ((__m512i)__builtin_ia32_pslldq512 ((__m512i)(A), (int)(N) * 8)) + +#define _mm512_bsrli_epi128(A, N) \ + ((__m512i)__builtin_ia32_psrldq512 ((__m512i)(A), (int)(N) * 8)) + +#endif + +#ifdef __DISABLE_AVX512BW__ +#undef __DISABLE_AVX512BW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512BW__ */ + +#endif /* _AVX512BWINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512cdintrin.h b/include-gcc/avx512cdintrin.h new file mode 100644 index 0000000..a5f5eab --- /dev/null +++ b/include-gcc/avx512cdintrin.h @@ -0,0 +1,184 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512cdintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512CDINTRIN_H_INCLUDED +#define _AVX512CDINTRIN_H_INCLUDED + +#ifndef __AVX512CD__ +#pragma GCC push_options +#pragma GCC target("avx512cd") +#define __DISABLE_AVX512CD__ +#endif /* __AVX512CD__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conflict_epi32 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictsi_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conflict_epi64 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vpconflictdi_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_lzcnt_epi64 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntq_512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_lzcnt_epi32 (__m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) + __builtin_ia32_vplzcntd_512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m512i) __builtin_ia32_broadcastmb512 (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m512i) __builtin_ia32_broadcastmw512 (__A); +} + +#ifdef __DISABLE_AVX512CD__ +#undef __DISABLE_AVX512CD__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512CD__ */ + +#endif /* _AVX512CDINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512dqintrin.h b/include-gcc/avx512dqintrin.h new file mode 100644 index 0000000..93900a0 --- /dev/null +++ b/include-gcc/avx512dqintrin.h @@ -0,0 +1,2891 @@ +/* Copyright (C) 2014-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512dqintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef _AVX512DQINTRIN_H_INCLUDED +#define _AVX512DQINTRIN_H_INCLUDED + +#ifndef __AVX512DQ__ +#pragma GCC push_options +#pragma GCC target("avx512dq") +#define __DISABLE_AVX512DQ__ +#endif /* __AVX512DQ__ */ + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_ktestcqi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzqi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestz_mask8_u8 (__mmask8 __A, __mmask8 __B) +{ + return (unsigned char) __builtin_ia32_ktestzqi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestc_mask8_u8 (__mmask8 __A, __mmask8 __B) +{ + return (unsigned char) __builtin_ia32_ktestcqi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_ktestchi (__A, __B); + return (unsigned char) __builtin_ia32_ktestzhi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestz_mask16_u8 (__mmask16 __A, __mmask16 __B) +{ + return (unsigned char) __builtin_ia32_ktestzhi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_ktestc_mask16_u8 (__mmask16 __A, __mmask16 __B) +{ + return (unsigned char) __builtin_ia32_ktestchi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask8_u8 (__mmask8 __A, __mmask8 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestcqi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzqi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask8_u8 (__mmask8 __A, __mmask8 __B) +{ + return (unsigned char) __builtin_ia32_kortestzqi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask8_u8 (__mmask8 __A, __mmask8 __B) +{ + return (unsigned char) __builtin_ia32_kortestcqi (__A, __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kaddqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kadd_mask16 (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kaddhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask8_u32 (__mmask8 __A) +{ + return (unsigned int) __builtin_ia32_kmovb ((__mmask8 ) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu32_mask8 (unsigned int __A) +{ + return (__mmask8) __builtin_ia32_kmovb ((__mmask8) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask8 (__mmask8 *__A) +{ + return (__mmask8) __builtin_ia32_kmovb (*(__mmask8 *) __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_store_mask8 (__mmask8 *__A, __mmask8 __B) +{ + *(__mmask8 *) __A = __builtin_ia32_kmovb (__B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_knot_mask8 (__mmask8 __A) +{ + return (__mmask8) __builtin_ia32_knotqi ((__mmask8) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kor_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_korqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxnor_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kxnorqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kxor_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kxorqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kand_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kandqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kandn_mask8 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask8) __builtin_ia32_kandnqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f64x2 (__m128d __A) +{ + return (__m512d) + __builtin_ia32_broadcastf64x2_512_mask ((__v2df) __A, + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f64x2 (__m512d __O, __mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) + __A, + (__v8df) + __O, __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x2_512_mask ((__v2df) + __A, + (__v8df) + _mm512_setzero_ps (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i64x2 (__m128i __A) +{ + return (__m512i) + __builtin_ia32_broadcasti64x2_512_mask ((__v2di) __A, + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i64x2 (__m512i __O, __mmask8 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) + __A, + (__v8di) + __O, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x2_512_mask ((__v2di) + __A, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f32x2 (__m128 __A) +{ + return (__m512) + __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, + (__v16sf)_mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, + (__v16sf) 
+ __O, __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x2_512_mask ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i32x2 (__m128i __A) +{ + return (__m512i) + __builtin_ia32_broadcasti32x2_512_mask ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) + __A, + (__v16si) + __O, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x2_512_mask ((__v4si) + __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f32x8 (__m256 __A) +{ + return (__m512) + __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f32x8 (__m512 __O, __mmask16 __M, __m256 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, + (__v16sf)__O, + __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f32x8 (__mmask16 __M, __m256 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x8_512_mask ((__v8sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i32x8 (__m256i __A) +{ + return (__m512i) + __builtin_ia32_broadcasti32x8_512_mask ((__v8si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i32x8 (__m512i __O, __mmask16 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) + __A, + (__v16si)__O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i32x8 (__mmask16 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x8_512_mask ((__v8si) + __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullo_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A * (__v8du) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mullo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmullq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + 
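/* Editorial usage sketch, not part of the upstream avx512dqintrin.h
   added by this patch: the masked 64-bit multiplies defined just above
   leave a lane untouched (the _mask form takes it from __W) or zero it
   (the _maskz form) whenever its mask bit is clear.  The helper name
   scale_even_lanes and the 0x55 mask are illustrative assumptions, and
   the header is assumed to be reached through <immintrin.h> on an
   AVX-512DQ-capable target.  */
__attribute__ ((target ("avx512dq")))
static __m512i scale_even_lanes (__m512i acc, __m512i factors)
{
  const __mmask8 even = 0x55;	/* qword lanes 0, 2, 4 and 6 */
  /* acc[i] * factors[i] where the mask bit is set; other lanes keep acc[i]. */
  return _mm512_mask_mullo_epi64 (acc, even, acc, factors);
}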
+extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_xor_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) +{ + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_xor_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_xorpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_xor_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_xor_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_xorps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_or_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_or_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_orpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_or_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_or_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_orps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_and_pd 
(__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_and_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) +{ + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_and_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_andpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_and_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_and_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_and_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_andps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_andnot_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B) +{ + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_andnot_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_andnpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_andnot_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B) +{ + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_andnot_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_andnps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movepi32_mask (__m512i __A) +{ + return (__mmask16) 
__builtin_ia32_cvtd2mask512 ((__v16si) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movepi64_mask (__m512i __A) +{ + return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movm_epi32 (__mmask16 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2d512 (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movm_epi64 (__mmask8 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2q512 (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epi64 (__m512d __A) +{ + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epu64 (__m512d __A) +{ + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epi64 (__m256 __A) +{ + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epu64 (__m256 __A) +{ + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 
(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epi64 (__m512d __A) +{ + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epu64 (__m512d __A) +{ + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) +{ + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_epi64 (__m256 __A) +{ + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_epu64 (__m256 __A) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + 
_mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_ps (__m512i __A) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu64_ps (__m512i __A) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_pd (__m512i __A) +{ + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) +{ + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) +{ + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu64_pd (__m512i __A) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), 
+ (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftli_mask8 (__mmask8 __A, unsigned int __B) +{ + return (__mmask8) __builtin_ia32_kshiftliqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftri_mask8 (__mmask8 __A, unsigned int __B) +{ + return (__mmask8) __builtin_ia32_kshiftriqi ((__mmask8) __A, (__mmask8) __B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_pd (__m512d __A, __m512d __B, int __C) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_pd (__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, int __C) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_pd (__mmask8 __U, __m512d __A, __m512d __B, int __C) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_ps (__m512 __A, __m512 __B, int __C) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_ps (__m512 __W, __mmask16 __U, + __m512 __A, __m512 __B, int __C) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_ps (__mmask16 __U, __m512 __A, __m512 __B, int __C) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_sd (__m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_sd 
(__m128d __A, __m128d __B, int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_reducesd_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + int __C, const int __R) +{ + return (__m128d) __builtin_ia32_reducesd_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_ss (__m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_ss (__m128 __A, __m128 __B, int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_reducess_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + int __C, const int __R) +{ + return (__m128) __builtin_ia32_reducess_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_sd (__m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) 
__B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_ss (__m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_ss (__mmask8 __U, __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_round_sd (__m128d __A, __m128d __B, int __C, const int __R) +{ + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + int __C, const int __R) +{ + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __C, + const int __R) +{ + return (__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_round_ss (__m128 __A, __m128 __B, int __C, const int __R) +{ + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + int __C, const int __R) +{ + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_round_ss (__mmask8 __U, 
__m128 __A, __m128 __B, int __C, + const int __R) +{ + return (__m128) __builtin_ia32_rangess128_mask_round ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_ss_mask (__m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_sd_mask (__m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_ss_mask (__mmask8 __U, __m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) __A, __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_sd_mask (__mmask8 __U, __m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) __A, __imm, __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epi64 (__m512d __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epi64 (__mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epu64 (__m512d __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epu64 (__mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epi64 (__m256 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) 
__W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epi64 (__mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epu64 (__m256 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epu64 (__mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epi64 (__m512d __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epi64 (__mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epu64 (__m512d __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epu64 (__mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epi64 (__m256 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epi64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) 
__builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epi64 (__mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epu64 (__m256 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epu64 (__m512i __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epu64 (__mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U, + __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi64_ps (__m512i __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi64_ps (__m256 __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi64_ps (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu64_ps (__m512i __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1, + __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu64_ps (__m256 __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu64_ps (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi64_pd (__m512i __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi64_pd (__m512d __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) 
__builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi64_pd (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu64_pd (__m512i __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu64_pd (__m512d __W, __mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu64_pd (__mmask8 __U, __m512i __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtuqq2pd512_mask ((__v8di) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_pd (__m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_pd (__m512d __A, int __B, const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_pd (__m512d __W, __mmask8 __U, __m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + int __B, const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) __W, + __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_pd (__mmask8 __U, __m512d __A, int __B) +{ + return (__m512d) __builtin_ia32_reducepd512_mask ((__v8df) __A, __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_pd (__mmask8 __U, __m512d __A, int __B, + const int __R) +{ + return (__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df) __A, + __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_ps (__m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_ps (__m512 __A, int __B, const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + 
(__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) __W, + __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_ps (__mmask16 __U, __m512 __A, int __B) +{ + return (__m512) __builtin_ia32_reduceps512_mask ((__v16sf) __A, __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_ps (__mmask16 __U, __m512 __A, int __B, + const int __R) +{ + return (__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf) __A, + __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf32x8_ps (__m512 __A, const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf32x8_ps (__m256 __W, __mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf32x8_ps (__mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_extractf32x8_mask ((__v16sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf64x2_pd (__m512d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) __W, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf64x2_pd (__mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti32x8_epi32 (__m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti32x8_epi32 (__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) 
__builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti32x8_epi32 (__mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti32x8_mask ((__v16si) __A, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti64x2_epi64 (__m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti64x2_epi64 (__mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_round_pd (__m512d __A, __m512d __B, int __C, + const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_round_pd (__m512d __W, __mmask8 __U, + __m512d __A, __m512d __B, int __C, + const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) __W, + (__mmask8) __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + int __C, const int __R) +{ + return (__m512d) __builtin_ia32_rangepd512_mask ((__v8df) __A, + (__v8df) __B, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_range_round_ps (__m512 __A, __m512 __B, int __C, const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_range_round_ps (__m512 __W, __mmask16 __U, + __m512 __A, __m512 __B, int __C, + const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) __W, + (__mmask16) __U, + __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_range_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + int __C, const int __R) +{ + return (__m512) __builtin_ia32_rangeps512_mask ((__v16sf) __A, + (__v16sf) __B, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti32x8 (__m512i __A, __m256i __B, const int __imm) +{ + return (__m512i) 
__builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti32x8 (__m512i __W, __mmask16 __U, __m512i __A, + __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti32x8 (__mmask16 __U, __m512i __A, __m256i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x8_mask ((__v16si) __A, + (__v8si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf32x8 (__m512 __A, __m256 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf32x8 (__m512 __W, __mmask16 __U, __m512 __A, + __m256 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf32x8 (__mmask16 __U, __m512 __A, __m256 __B, + const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x8_mask ((__v16sf) __A, + (__v8sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti64x2 (__m512i __A, __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti64x2 (__m512i __W, __mmask8 __U, __m512i __A, + __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) __W, + (__mmask8) + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti64x2 (__mmask8 __U, __m512i __A, __m128i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di) __A, + (__v2di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) + __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf64x2 (__m512d __A, __m128d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf64x2 (__m512d __W, __mmask8 __U, __m512d __A, + __m128d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) __W, + (__mmask8) + __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf64x2 (__mmask8 __U, __m512d 
__A, __m128d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df) __A, + (__v2df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) + __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fpclass_pd_mask (__mmask8 __U, __m512d __A, + const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fpclass_pd_mask (__m512d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) __A, + __imm, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fpclass_ps_mask (__mmask16 __U, __m512 __A, + const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A, + __imm, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fpclass_ps_mask (__m512 __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) __A, + __imm, + (__mmask16) -1); +} + +#else +#define _kshiftli_mask8(X, Y) \ + ((__mmask8) __builtin_ia32_kshiftliqi ((__mmask8)(X), (__mmask8)(Y))) + +#define _kshiftri_mask8(X, Y) \ + ((__mmask8) __builtin_ia32_kshiftriqi ((__mmask8)(X), (__mmask8)(Y))) + +#define _mm_range_sd(A, B, C) \ + ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8) -1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_range_sd(W, U, A, B, C) \ + ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_range_sd(U, A, B, C) \ + ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_range_ss(A, B, C) \ + ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8) -1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_range_ss(W, U, A, B, C) \ + ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_range_ss(U, A, B, C) \ + ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_range_round_sd(A, B, C, R) \ + ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_range_round_sd(W, U, A, B, C, R) \ + ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \ + (__mmask8)(U), (R))) + +#define _mm_maskz_range_round_sd(U, A, B, C, R) \ + ((__m128d) __builtin_ia32_rangesd128_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(U), (R))) + +#define _mm_range_round_ss(A, B, C, R) \ + ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), 
(int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_range_round_ss(W, U, A, B, C, R) \ + ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U), (R))) + +#define _mm_maskz_range_round_ss(U, A, B, C, R) \ + ((__m128) __builtin_ia32_rangess128_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U), (R))) + +#define _mm512_cvtt_roundpd_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di) \ + _mm512_setzero_si512 (), \ + -1, (B))) + +#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundpd_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvtt_roundpd_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) + +#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundpd_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvtt_roundps_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) + +#define _mm512_mask_cvtt_roundps_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundps_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvtt_roundps_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) + +#define _mm512_mask_cvtt_roundps_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvtt_roundps_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvt_roundpd_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) + +#define _mm512_mask_cvt_roundpd_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundpd_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvt_roundpd_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) + +#define _mm512_mask_cvt_roundpd_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundpd_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvt_roundps_epi64(A, B) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) + +#define _mm512_mask_cvt_roundps_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundps_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvt_roundps_epu64(A, B) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), -1, (B))) + 
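The rounding-control conversions above take an explicit rounding immediate instead of the current MXCSR mode, and each comes in plain, mask (merge) and maskz (zeroing) forms. The short sketch below is editorial and not part of the imported GCC header; the helper names are hypothetical and it assumes a build targeting AVX512DQ (for example with -mavx512dq) that includes <immintrin.h>.

/* Illustrative usage sketch only; not part of the vendored header.  */
#include <immintrin.h>

/* Truncate eight doubles toward zero into signed 64-bit lanes.  */
static inline __m512i
trunc_pd_to_i64 (__m512d v)
{
  return _mm512_cvttpd_epi64 (v);
}

/* Zeroing form with an explicit rounding override: round to nearest and
   suppress floating-point exceptions; lanes whose bit in KEEP is clear
   become zero.  */
static inline __m512i
round_pd_to_i64_maskz (__mmask8 keep, __m512d v)
{
  return _mm512_maskz_cvt_roundpd_epi64 (keep, v,
					 _MM_FROUND_TO_NEAREST_INT
					 | _MM_FROUND_NO_EXC);
}

/* Merge form: lanes whose bit in KEEP is clear keep the corresponding
   value from FALLBACK instead.  */
static inline __m512i
round_pd_to_i64_merge (__m512i fallback, __mmask8 keep, __m512d v)
{
  return _mm512_mask_cvt_roundpd_epi64 (fallback, keep, v,
					_MM_FROUND_TO_NEAREST_INT
					| _MM_FROUND_NO_EXC);
}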
+#define _mm512_mask_cvt_roundps_epu64(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)(W), (U), (B))) + +#define _mm512_maskz_cvt_roundps_epu64(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask ((A), (__v8di)_mm512_setzero_si512 (), (U), (B))) + +#define _mm512_cvt_roundepi64_ps(A, B) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B))) + +#define _mm512_mask_cvt_roundepi64_ps(W, U, A, B) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (W), (U), (B))) + +#define _mm512_maskz_cvt_roundepi64_ps(U, A, B) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B))) + +#define _mm512_cvt_roundepu64_ps(A, B) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), -1, (B))) + +#define _mm512_mask_cvt_roundepu64_ps(W, U, A, B) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (W), (U), (B))) + +#define _mm512_maskz_cvt_roundepu64_ps(U, A, B) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask ((__v8di)(A), (__v8sf)_mm256_setzero_ps (), (U), (B))) + +#define _mm512_cvt_roundepi64_pd(A, B) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B))) + +#define _mm512_mask_cvt_roundepi64_pd(W, U, A, B) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (W), (U), (B))) + +#define _mm512_maskz_cvt_roundepi64_pd(U, A, B) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B))) + +#define _mm512_cvt_roundepu64_pd(A, B) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), -1, (B))) + +#define _mm512_mask_cvt_roundepu64_pd(W, U, A, B) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (W), (U), (B))) + +#define _mm512_maskz_cvt_roundepu64_pd(U, A, B) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask ((__v8di)(A), (__v8df)_mm512_setzero_pd (), (U), (B))) + +#define _mm512_reduce_pd(A, B) \ + ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ + (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1)) + +#define _mm512_reduce_round_pd(A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R))) + +#define _mm512_mask_reduce_pd(W, U, A, B) \ + ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ + (int)(B), (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)(__m512d)(W), (U), (R))) + +#define _mm512_maskz_reduce_pd(U, A, B) \ + ((__m512d) __builtin_ia32_reducepd512_mask ((__v8df)(__m512d)(A), \ + (int)(B), (__v8df)_mm512_setzero_pd (), (__mmask8)(U))) + +#define _mm512_maskz_reduce_round_pd(U, A, B, R) \ + ((__m512d) __builtin_ia32_reducepd512_mask_round ((__v8df)(__m512d)(A),\ + (int)(B), (__v8df)_mm512_setzero_pd (), (U), (R))) + +#define _mm512_reduce_ps(A, B) \ + ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1)) + +#define _mm512_reduce_round_ps(A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R))) + +#define _mm512_mask_reduce_ps(W, U, A, B) \ + ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ + (int)(B), (__v16sf)(__m512)(W), (__mmask16)(U))) + +#define 
_mm512_mask_reduce_round_ps(W, U, A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)(__m512)(W), (U), (R))) + +#define _mm512_maskz_reduce_ps(U, A, B) \ + ((__m512) __builtin_ia32_reduceps512_mask ((__v16sf)(__m512)(A), \ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U))) + +#define _mm512_maskz_reduce_round_ps(U, A, B, R) \ + ((__m512) __builtin_ia32_reduceps512_mask_round ((__v16sf)(__m512)(A),\ + (int)(B), (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R))) + +#define _mm512_extractf32x8_ps(X, C) \ + ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \ + (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8)-1)) + +#define _mm512_mask_extractf32x8_ps(W, U, X, C) \ + ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \ + (int) (C), (__v8sf)(__m256) (W), (__mmask8) (U))) + +#define _mm512_maskz_extractf32x8_ps(U, X, C) \ + ((__m256) __builtin_ia32_extractf32x8_mask ((__v16sf)(__m512) (X), \ + (int) (C), (__v8sf)(__m256) _mm256_setzero_ps (), (__mmask8) (U))) + +#define _mm512_extractf64x2_pd(X, C) \ + ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\ + (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8)-1)) + +#define _mm512_mask_extractf64x2_pd(W, U, X, C) \ + ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\ + (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U))) + +#define _mm512_maskz_extractf64x2_pd(U, X, C) \ + ((__m128d) __builtin_ia32_extractf64x2_512_mask ((__v8df)(__m512d) (X),\ + (int) (C), (__v2df)(__m128d) _mm_setzero_pd (), (__mmask8) (U))) + +#define _mm512_extracti32x8_epi32(X, C) \ + ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), \ + (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8)-1)) + +#define _mm512_mask_extracti32x8_epi32(W, U, X, C) \ + ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), \ + (int) (C), (__v8si)(__m256i) (W), (__mmask8) (U))) + +#define _mm512_maskz_extracti32x8_epi32(U, X, C) \ + ((__m256i) __builtin_ia32_extracti32x8_mask ((__v16si)(__m512i) (X), \ + (int) (C), (__v8si)(__m256i) _mm256_setzero_si256 (), (__mmask8) (U))) + +#define _mm512_extracti64x2_epi64(X, C) \ + ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\ + (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1)) + +#define _mm512_mask_extracti64x2_epi64(W, U, X, C) \ + ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\ + (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U))) + +#define _mm512_maskz_extracti64x2_epi64(U, X, C) \ + ((__m128i) __builtin_ia32_extracti64x2_512_mask ((__v8di)(__m512i) (X),\ + (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) + +#define _mm512_range_pd(A, B, C) \ + ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd (), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_range_pd(W, U, A, B, C) \ + ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_range_pd(U, A, B, C) \ + ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd (), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_range_ps(A, B, C) \ + ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \ + 
(__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_range_ps(W, U, A, B, C) \ + ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_range_ps(U, A, B, C) \ + ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_range_round_pd(A, B, C, R) \ + ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd (), (__mmask8)-1, (R))) + +#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ + ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), (R))) + +#define _mm512_maskz_range_round_pd(U, A, B, C, R) \ + ((__m512d) __builtin_ia32_rangepd512_mask ((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd (), (__mmask8)(U), (R))) + +#define _mm512_range_round_ps(A, B, C, R) \ + ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps (), (__mmask16)-1, (R))) + +#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ + ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), (R))) + +#define _mm512_maskz_range_round_ps(U, A, B, C, R) \ + ((__m512) __builtin_ia32_rangeps512_mask ((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps (), (__mmask16)(U), (R))) + +#define _mm512_insertf64x2(X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\ + (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (X), \ + (__mmask8)-1)) + +#define _mm512_mask_insertf64x2(W, U, X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\ + (__v2df)(__m128d) (Y), (int) (C), (__v8df)(__m512d) (W), \ + (__mmask8) (U))) + +#define _mm512_maskz_insertf64x2(U, X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x2_512_mask ((__v8df)(__m512d) (X),\ + (__v2df)(__m128d) (Y), (int) (C), \ + (__v8df)(__m512d) _mm512_setzero_pd (), (__mmask8) (U))) + +#define _mm512_inserti64x2(X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\ + (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (X), (__mmask8)-1)) + +#define _mm512_mask_inserti64x2(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\ + (__v2di)(__m128i) (Y), (int) (C), (__v8di)(__m512i) (W), \ + (__mmask8) (U))) + +#define _mm512_maskz_inserti64x2(U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x2_512_mask ((__v8di)(__m512i) (X),\ + (__v2di)(__m128i) (Y), (int) (C), \ + (__v8di)(__m512i) _mm512_setzero_si512 (), (__mmask8) (U))) + +#define _mm512_insertf32x8(X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), \ + (__v8sf)(__m256) (Y), (int) (C),\ + (__v16sf)(__m512)_mm512_setzero_ps (),\ + (__mmask16)-1)) + +#define _mm512_mask_insertf32x8(W, U, X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x8_mask ((__v16sf)(__m512) (X), \ + (__v8sf)(__m256) (Y), (int) (C),\ + (__v16sf)(__m512)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_insertf32x8(U, X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x8_mask 
((__v16sf)(__m512) (X), \ + (__v8sf)(__m256) (Y), (int) (C),\ + (__v16sf)(__m512)_mm512_setzero_ps (),\ + (__mmask16)(U))) + +#define _mm512_inserti32x8(X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), \ + (__v8si)(__m256i) (Y), (int) (C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)-1)) + +#define _mm512_mask_inserti32x8(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), \ + (__v8si)(__m256i) (Y), (int) (C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_inserti32x8(U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x8_mask ((__v16si)(__m512i) (X), \ + (__v8si)(__m256i) (Y), (int) (C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) + +#define _mm_fpclass_ss_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), \ + (int) (C), (__mmask8) (-1))) \ + +#define _mm_fpclass_sd_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \ + (int) (C), (__mmask8) (-1))) \ + +#define _mm_mask_fpclass_ss_mask(X, C, U) \ + ((__mmask8) __builtin_ia32_fpclassss_mask ((__v4sf) (__m128) (X), \ + (int) (C), (__mmask8) (U))) + +#define _mm_mask_fpclass_sd_mask(X, C, U) \ + ((__mmask8) __builtin_ia32_fpclasssd_mask ((__v2df) (__m128d) (X), \ + (int) (C), (__mmask8) (U))) + +#define _mm512_mask_fpclass_pd_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), \ + (int) (C), (__mmask8)(u))) + +#define _mm512_mask_fpclass_ps_mask(u, x, c) \ + ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\ + (int) (c),(__mmask16)(u))) + +#define _mm512_fpclass_pd_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclasspd512_mask ((__v8df) (__m512d) (X), \ + (int) (C), (__mmask8)-1)) + +#define _mm512_fpclass_ps_mask(x, c) \ + ((__mmask16) __builtin_ia32_fpclassps512_mask ((__v16sf) (__m512) (x),\ + (int) (c),(__mmask16)-1)) + +#define _mm_reduce_sd(A, B, C) \ + ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)-1)) + +#define _mm_mask_reduce_sd(W, U, A, B, C) \ + ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_sd(U, A, B, C) \ + ((__m128d) __builtin_ia32_reducesd_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(U))) + +#define _mm_reduce_round_sd(A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__mmask8)(U), (int)(R))) + +#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ + ((__m128d) __builtin_ia32_reducesd_mask_round ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), (__v2df) _mm_setzero_pd (), \ + (__mmask8)(U), (int)(R))) + +#define _mm_reduce_ss(A, B, C) \ + ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)-1)) + +#define _mm_mask_reduce_ss(W, U, A, B, C) \ + ((__m128) __builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_ss(U, A, B, C) \ + ((__m128) 
__builtin_ia32_reducess_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm_reduce_round_ss(A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__mmask8)(U), (int)(R))) + +#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducess_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ + ((__m128) __builtin_ia32_reducesd_mask_round ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), (__v4sf) _mm_setzero_ps (), \ + (__mmask8)(U), (int)(R))) + + +#endif + +#ifdef __DISABLE_AVX512DQ__ +#undef __DISABLE_AVX512DQ__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512DQ__ */ + +#endif /* _AVX512DQINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512erintrin.h b/include-gcc/avx512erintrin.h new file mode 100644 index 0000000..bd83b7f --- /dev/null +++ b/include-gcc/avx512erintrin.h @@ -0,0 +1,536 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512ERINTRIN_H_INCLUDED +#define _AVX512ERINTRIN_H_INCLUDED + +#ifndef __AVX512ER__ +#pragma GCC push_options +#pragma GCC target("avx512er") +#define __DISABLE_AVX512ER__ +#endif /* __AVX512ER__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef double __v8df __attribute__ ((__vector_size__ (64))); +typedef float __v16sf __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. 
*/ +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_exp2a23_round_pd (__m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_exp2a23_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_exp2a23_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_exp2pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_exp2a23_round_ps (__m512 __A, int __R) +{ + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_exp2a23_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_exp2a23_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_exp2ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp28_round_pd (__m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp28_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rcp28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp28_round_ps (__m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp28_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rcp28ps_mask ((__v16sf) __A, + 
(__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp28_round_sd (__m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_round ((__v2df) __B, + (__v2df) __A, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rcp28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp28_round_ss (__m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_round ((__v4sf) __B, + (__v4sf) __A, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp28_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rcp28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt28_round_pd (__m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt28_round_pd (__m512d __W, __mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt28_round_pd (__mmask8 __U, __m512d __A, int __R) +{ + return (__m512d) __builtin_ia32_rsqrt28pd_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt28_round_ps (__m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt28_round_ps (__m512 __W, __mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt28_round_ps (__mmask16 __U, __m512 __A, int __R) +{ + return (__m512) __builtin_ia32_rsqrt28ps_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_rsqrt28_round_sd (__m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_round ((__v2df) __B, + (__v2df) __A, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_sd (__mmask8 __U, __m128d __A, __m128d __B, int __R) +{ + return (__m128d) __builtin_ia32_rsqrt28sd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt28_round_ss (__m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_round ((__v4sf) __B, + (__v4sf) __A, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt28_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt28_round_ss (__mmask8 __U, __m128 __A, __m128 __B, int __R) +{ + return (__m128) __builtin_ia32_rsqrt28ss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __U, + __R); +} + +#else +#define _mm512_exp2a23_round_pd(A, C) \ + __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_exp2a23_round_pd(W, U, A, C) \ + __builtin_ia32_exp2pd_mask(A, W, U, C) + +#define _mm512_maskz_exp2a23_round_pd(U, A, C) \ + __builtin_ia32_exp2pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_exp2a23_round_ps(A, C) \ + __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_exp2a23_round_ps(W, U, A, C) \ + __builtin_ia32_exp2ps_mask(A, W, U, C) + +#define _mm512_maskz_exp2a23_round_ps(U, A, C) \ + __builtin_ia32_exp2ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_rcp28_round_pd(A, C) \ + __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_rcp28_round_pd(W, U, A, C) \ + __builtin_ia32_rcp28pd_mask(A, W, U, C) + +#define _mm512_maskz_rcp28_round_pd(U, A, C) \ + __builtin_ia32_rcp28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_rcp28_round_ps(A, C) \ + __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_rcp28_round_ps(W, U, A, C) \ + __builtin_ia32_rcp28ps_mask(A, W, U, C) + +#define _mm512_maskz_rcp28_round_ps(U, A, C) \ + __builtin_ia32_rcp28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_rsqrt28_round_pd(A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), -1, C) + +#define _mm512_mask_rsqrt28_round_pd(W, U, A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, W, U, C) + +#define _mm512_maskz_rsqrt28_round_pd(U, A, C) \ + __builtin_ia32_rsqrt28pd_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_rsqrt28_round_ps(A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), -1, C) + +#define _mm512_mask_rsqrt28_round_ps(W, U, A, C) \ + __builtin_ia32_rsqrt28ps_mask(A, W, U, C) + +#define _mm512_maskz_rsqrt28_round_ps(U, A, 
C) \ + __builtin_ia32_rsqrt28ps_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_rcp28_round_sd(A, B, R) \ + __builtin_ia32_rcp28sd_round(A, B, R) + +#define _mm_mask_rcp28_round_sd(W, U, A, B, R) \ + __builtin_ia32_rcp28sd_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rcp28_round_sd(U, A, B, R) \ + __builtin_ia32_rcp28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (), \ + (U), (R)) + +#define _mm_rcp28_round_ss(A, B, R) \ + __builtin_ia32_rcp28ss_round(A, B, R) + +#define _mm_mask_rcp28_round_ss(W, U, A, B, R) \ + __builtin_ia32_rcp28ss_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rcp28_round_ss(U, A, B, R) \ + __builtin_ia32_rcp28ss_mask_round ((A), (B), (__v4sf) _mm_setzero_ps (), \ + (U), (R)) + +#define _mm_rsqrt28_round_sd(A, B, R) \ + __builtin_ia32_rsqrt28sd_round(A, B, R) + +#define _mm_mask_rsqrt28_round_sd(W, U, A, B, R) \ + __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rsqrt28_round_sd(U, A, B, R) \ + __builtin_ia32_rsqrt28sd_mask_round ((A), (B), (__v2df) _mm_setzero_pd (),\ + (U), (R)) + +#define _mm_rsqrt28_round_ss(A, B, R) \ + __builtin_ia32_rsqrt28ss_round(A, B, R) + +#define _mm_mask_rsqrt28_round_ss(W, U, A, B, R) \ + __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (W), (U), (R)) + +#define _mm_maskz_rsqrt28_round_ss(U, A, B, R) \ + __builtin_ia32_rsqrt28ss_mask_round ((A), (B), (__v4sf) _mm_setzero_ps (),\ + (U), (R)) + +#endif + +#define _mm_mask_rcp28_sd(W, U, A, B)\ + _mm_mask_rcp28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_sd(U, A, B)\ + _mm_maskz_rcp28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_ss(W, U, A, B)\ + _mm_mask_rcp28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_ss(U, A, B)\ + _mm_maskz_rcp28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_sd(W, U, A, B)\ + _mm_mask_rsqrt28_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_sd(U, A, B)\ + _mm_maskz_rsqrt28_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_ss(W, U, A, B)\ + _mm_mask_rsqrt28_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_ss(U, A, B)\ + _mm_maskz_rsqrt28_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_exp2a23_pd(A) \ + _mm512_exp2a23_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_pd(W, U, A) \ + _mm512_mask_exp2a23_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_pd(U, A) \ + _mm512_maskz_exp2a23_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_exp2a23_ps(A) \ + _mm512_exp2a23_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_ps(W, U, A) \ + _mm512_mask_exp2a23_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_ps(U, A) \ + _mm512_maskz_exp2a23_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_pd(A) \ + _mm512_rcp28_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_pd(W, U, A) \ + _mm512_mask_rcp28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_pd(U, A) \ + _mm512_maskz_rcp28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_ps(A) \ + _mm512_rcp28_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_ps(W, U, A) \ + _mm512_mask_rcp28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_ps(U, A) \ + _mm512_maskz_rcp28_round_ps(U, A, 
_MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_pd(A) \ + _mm512_rsqrt28_round_pd(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_pd(W, U, A) \ + _mm512_mask_rsqrt28_round_pd(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_pd(U, A) \ + _mm512_maskz_rsqrt28_round_pd(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_ps(A) \ + _mm512_rsqrt28_round_ps(A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_ps(W, U, A) \ + _mm512_mask_rsqrt28_round_ps(W, U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_ps(U, A) \ + _mm512_maskz_rsqrt28_round_ps(U, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_sd(A, B) \ + __builtin_ia32_rcp28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_ss(A, B) \ + __builtin_ia32_rcp28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_sd(A, B) \ + __builtin_ia32_rsqrt28sd_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_ss(A, B) \ + __builtin_ia32_rsqrt28ss_round(B, A, _MM_FROUND_CUR_DIRECTION) + +#ifdef __DISABLE_AVX512ER__ +#undef __DISABLE_AVX512ER__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512ER__ */ + +#endif /* _AVX512ERINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512fintrin.h b/include-gcc/avx512fintrin.h new file mode 100644 index 0000000..89b3219 --- /dev/null +++ b/include-gcc/avx512fintrin.h @@ -0,0 +1,16483 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512FINTRIN_H_INCLUDED +#define _AVX512FINTRIN_H_INCLUDED + +#ifndef __AVX512F__ +#pragma GCC push_options +#pragma GCC target("avx512f") +#define __DISABLE_AVX512F__ +#endif /* __AVX512F__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef double __v8df __attribute__ ((__vector_size__ (64))); +typedef float __v16sf __attribute__ ((__vector_size__ (64))); +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef unsigned long long __v8du __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); +typedef unsigned int __v16su __attribute__ ((__vector_size__ (64))); +typedef short __v32hi __attribute__ ((__vector_size__ (64))); +typedef unsigned short __v32hu __attribute__ ((__vector_size__ (64))); +typedef char __v64qi __attribute__ ((__vector_size__ (64))); +typedef unsigned char __v64qu __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. 
*/ +typedef float __m512 __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); +typedef double __m512d __attribute__ ((__vector_size__ (64), __may_alias__)); + +/* Unaligned version of the same type. */ +typedef float __m512_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); +typedef long long __m512i_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); +typedef double __m512d_u __attribute__ ((__vector_size__ (64), __may_alias__, __aligned__ (1))); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_int2mask (int __M) +{ + return (__mmask16) __M; +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2int (__mmask16 __M) +{ + return (int) __M; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi64 (long long __A, long long __B, long long __C, + long long __D, long long __E, long long __F, + long long __G, long long __H) +{ + return __extension__ (__m512i) (__v8di) + { __H, __G, __F, __E, __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H I J K L M N O P]. */ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H, + int __I, int __J, int __K, int __L, + int __M, int __N, int __O, int __P) +{ + return __extension__ (__m512i)(__v16si) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi16 (short __q31, short __q30, short __q29, short __q28, + short __q27, short __q26, short __q25, short __q24, + short __q23, short __q22, short __q21, short __q20, + short __q19, short __q18, short __q17, short __q16, + short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return __extension__ (__m512i)(__v32hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31 + }; +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_epi8 (char __q63, char __q62, char __q61, char __q60, + char __q59, char __q58, char __q57, char __q56, + char __q55, char __q54, char __q53, char __q52, + char __q51, char __q50, char __q49, char __q48, + char __q47, char __q46, char __q45, char __q44, + char __q43, char __q42, char __q41, char __q40, + char __q39, char __q38, char __q37, char __q36, + char __q35, char __q34, char __q33, char __q32, + char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m512i)(__v64qi){ + __q00, __q01, 
__q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31, + __q32, __q33, __q34, __q35, __q36, __q37, __q38, __q39, + __q40, __q41, __q42, __q43, __q44, __q45, __q46, __q47, + __q48, __q49, __q50, __q51, __q52, __q53, __q54, __q55, + __q56, __q57, __q58, __q59, __q60, __q61, __q62, __q63 + }; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_pd (double __A, double __B, double __C, double __D, + double __E, double __F, double __G, double __H) +{ + return __extension__ (__m512d) + { __H, __G, __F, __E, __D, __C, __B, __A }; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H, + float __I, float __J, float __K, float __L, + float __M, float __N, float __O, float __P) +{ + return __extension__ (__m512) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; +} + +#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ + _mm512_set_epi64(e7,e6,e5,e4,e3,e2,e1,e0) + +#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \ + e8,e9,e10,e11,e12,e13,e14,e15) \ + _mm512_set_epi32(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) + +#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ + _mm512_set_pd(e7,e6,e5,e4,e3,e2,e1,e0) + +#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \ + _mm512_set_ps(e15,e14,e13,e12,e11,e10,e9,e8,e7,e6,e5,e4,e3,e2,e1,e0) + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_ps (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m512 __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +#define _mm512_undefined _mm512_undefined_ps + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_pd (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m512d __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_epi32 (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m512i __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +#define _mm512_undefined_si512 _mm512_undefined_epi32 + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi8 (char __A) +{ + return __extension__ (__m512i)(__v64qi) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi16 (short __A) +{ + return __extension__ (__m512i)(__v32hi) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
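/* Illustrative sketch, not part of the upstream GCC header: the set and
   setr constructors above fill lanes in opposite orders.  _mm512_set_epi32
   takes the highest-numbered element first, while _mm512_setr_epi32 takes
   element 0 first, and _mm512_set1_epi16 broadcasts one value.  Assuming a
   translation unit compiled with -mavx512f (or an equivalent target
   attribute), a consumer could write:

       #include <immintrin.h>

       __m512i ascending_ints (void)
       {
         // lanes 0..15 hold 0, 1, 2, ..., 15
         return _mm512_setr_epi32 (0, 1, 2, 3, 4, 5, 6, 7,
                                   8, 9, 10, 11, 12, 13, 14, 15);
       }

       __m512i all_sevens (void)
       {
         // broadcast 7 into all 32 16-bit lanes
         return _mm512_set1_epi16 (7);
       }
*/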
+_mm512_set1_pd (double __A) +{ + return __extension__ (__m512d)(__v8df) + { __A, __A, __A, __A, __A, __A, __A, __A }; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_ps (float __A) +{ + return __extension__ (__m512)(__v16sf) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} + +/* Create the vector [A B C D A B C D A B C D A B C D]. */ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_epi32 (int __A, int __B, int __C, int __D) +{ + return __extension__ (__m512i)(__v16si) + { __D, __C, __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A, __D, __C, __B, __A }; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_epi64 (long long __A, long long __B, long long __C, + long long __D) +{ + return __extension__ (__m512i) (__v8di) + { __D, __C, __B, __A, __D, __C, __B, __A }; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_pd (double __A, double __B, double __C, double __D) +{ + return __extension__ (__m512d) + { __D, __C, __B, __A, __D, __C, __B, __A }; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set4_ps (float __A, float __B, float __C, float __D) +{ + return __extension__ (__m512) + { __D, __C, __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A, __D, __C, __B, __A }; +} + +#define _mm512_setr4_epi64(e0,e1,e2,e3) \ + _mm512_set4_epi64(e3,e2,e1,e0) + +#define _mm512_setr4_epi32(e0,e1,e2,e3) \ + _mm512_set4_epi32(e3,e2,e1,e0) + +#define _mm512_setr4_pd(e0,e1,e2,e3) \ + _mm512_set4_pd(e3,e2,e1,e0) + +#define _mm512_setr4_ps(e0,e1,e2,e3) \ + _mm512_set4_ps(e3,e2,e1,e0) + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_ps (void) +{ + return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero (void) +{ + return _mm512_setzero_ps (); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_pd (void) +{ + return __extension__ (__m512d) { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_epi32 (void) +{ + return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_si512 (void) +{ + return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movapd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) 
__builtin_ia32_movaps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movaps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_pd (void const *__P) +{ + return *(__m512d *) __P; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_pd (void *__P, __m512d __A) +{ + *(__m512d *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeapd512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_ps (void const *__P) +{ + return *(__m512 *) __P; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_ps (void *__P, __m512 __A) +{ + *(__m512 *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeaps512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa64_512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_epi64 (void const *__P) +{ + return *(__m512i *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) 
__builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_epi64 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_movdqa32_512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_si512 (void const *__P) +{ + return *(__m512i *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_epi32 (void const *__P) +{ + return *(__m512i *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_si512 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_epi32 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullo_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A * (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mullo_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullo_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i 
__B) +{ + return (__m512i) __builtin_ia32_pmulld512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mullox_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A * (__v8du) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mullox_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return _mm512_mask_mov_epi64 (__W, __M, _mm512_mullox_epi64 (__A, __B)); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sllv_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sllv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sllv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srlv_epi32 (__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srlv_epi32 (__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv16si_mask ((__v16si) __X, + (__v16si) __Y, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A + (__v8du) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_add_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A - (__v8du) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sllv_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sllv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sllv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psllv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srav_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srav_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srav_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrav8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srlv_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srlv_epi64 (__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) 
__builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srlv_epi64 (__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_psrlv8di_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A + (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_paddd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_epi32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_epi32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuldq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A - (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psubd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_epu32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_epu32 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
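/* Illustrative sketch, not part of the upstream GCC header: the mask and
   maskz arithmetic forms above either merge with a source vector or zero
   the unselected lanes.  A lane whose mask bit is set receives the result
   of the operation; a clear bit keeps the value from __W (mask form) or
   yields zero (maskz form).  Assuming -mavx512f:

       #include <immintrin.h>

       __m512i add_even_lanes (__m512i a, __m512i b)
       {
         // 0x5555 selects the even 32-bit lanes; odd lanes keep a's values
         return _mm512_mask_add_epi32 (a, (__mmask16) 0x5555, a, b);
       }

       __m512i add_or_zero (__mmask16 m, __m512i a, __m512i b)
       {
         // lanes with a clear mask bit become zero
         return _mm512_maskz_add_epi32 (m, a, b);
       }
*/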
+_mm512_maskz_mul_epu32 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_pmuludq512_mask ((__v16si) __X, + (__v16si) __Y, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psllqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_slli_epi64(X, C) \ + ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask8)-1)) + +#define _mm512_mask_slli_epi64(W, U, X, C) \ + ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_slli_epi64(U, X, C) \ + ((__m512i) __builtin_ia32_psllqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psllq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi64 (__m512i __W, __mmask8 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrlqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_srli_epi64(X, C) \ + ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + 
(__v8di)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask8)-1)) + +#define _mm512_mask_srli_epi64(W, U, X, C) \ + ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_srli_epi64(U, X, C) \ + ((__m512i) __builtin_ia32_psrlqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srl_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrlq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi64 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi64 (__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psraqi512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_srai_epi64(X, C) \ + ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask8)-1)) + +#define _mm512_mask_srai_epi64(W, U, X, C) \ + ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_srai_epi64(U, X, C) \ + ((__m512i) __builtin_ia32_psraqi512_mask ((__v8di)(__m512i)(X), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi64 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi64 (__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psraq512_mask ((__v8di) __A, + (__v2di) 
__B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_slli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_slli_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_slli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_pslldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +#else +#define _mm512_slli_epi32(X, C) \ + ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask16)-1)) + +#define _mm512_mask_slli_epi32(W, U, X, C) \ + ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_slli_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_pslldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sll_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sll_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sll_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_pslld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srli_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srli_epi32 (__m512i __W, __mmask16 __U, + __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srli_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psrldi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +#else +#define _mm512_srli_epi32(X, C) \ + ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask16)-1)) + +#define _mm512_mask_srli_epi32(W, U, X, C) \ + 
((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_srli_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_psrldi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srl_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srl_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srl_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrld512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_srai_epi32 (__m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_srai_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_srai_epi32 (__mmask16 __U, __m512i __A, unsigned int __B) +{ + return (__m512i) __builtin_ia32_psradi512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} +#else +#define _mm512_srai_epi32(X, C) \ + ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask16)-1)) + +#define _mm512_mask_srai_epi32(W, U, X, C) \ + ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_srai_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_psradi512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sra_epi32 (__m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sra_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sra_epi32 (__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i) __builtin_ia32_psrad512_mask ((__v16si) __A, + (__v4si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); 
+} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_addsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_addss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_subsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_subss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +#else 
+#define _mm_add_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_addsd_round(A, B, C) + +#define _mm_mask_add_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_addsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_add_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_addsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_add_round_ss(A, B, C) \ + (__m128)__builtin_ia32_addss_round(A, B, C) + +#define _mm_mask_add_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_addss_mask_round(A, B, W, U, C) + +#define _mm_maskz_add_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_addss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#define _mm_sub_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_subsd_round(A, B, C) + +#define _mm_mask_sub_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_subsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_sub_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_subsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_sub_round_ss(A, B, C) \ + (__m128)__builtin_ia32_subss_round(A, B, C) + +#define _mm_mask_sub_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_subss_mask_round(A, B, W, U, C) + +#define _mm_maskz_sub_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_subss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#endif + +/* Constant helper to represent the ternary logic operations among + vector A, B and C. */ +typedef enum +{ + _MM_TERNLOG_A = 0xF0, + _MM_TERNLOG_B = 0xCC, + _MM_TERNLOG_C = 0xAA +} _MM_TERNLOG_ENUM; + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi64 (__m512i __A, __m512i __B, __m512i __C, + const int __imm) +{ + return (__m512i) + __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi64 (__m512i __A, __mmask8 __U, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) + __builtin_ia32_pternlogq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi64 (__mmask8 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) + __builtin_ia32_pternlogq512_maskz ((__v8di) __A, + (__v8di) __B, + (__v8di) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ternarylogic_epi32 (__m512i __A, __m512i __B, __m512i __C, + const int __imm) +{ + return (__m512i) + __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ternarylogic_epi32 (__m512i __A, __mmask16 __U, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) + __builtin_ia32_pternlogd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __C, + (unsigned char) __imm, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ternarylogic_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + __m512i __C, const int __imm) +{ + return (__m512i) + __builtin_ia32_pternlogd512_maskz ((__v16si) __A, + (__v16si) __B, + (__v16si) 
__C, + (unsigned char) __imm, + (__mmask16) __U); +} +#else +#define _mm512_ternarylogic_epi64(A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_mask ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char)(I), \ + (__mmask8) (U))) +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogq512_maskz ((__v8di) (__m512i) (A), \ + (__v8di) (__m512i) (B), \ + (__v8di) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) +#define _mm512_ternarylogic_epi32(A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) -1)) +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_mask ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) (U))) +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m512i) \ + __builtin_ia32_pternlogd512_maskz ((__v16si) (__m512i) (A), \ + (__v16si) (__m512i) (B), \ + (__v16si) (__m512i) (C), \ + (unsigned char) (I), \ + (__mmask16) (U))) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp14_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp14_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd ((__v2df) __B, + (__v2df) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} 
+ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss ((__v4sf) __B, + (__v4sf) __A); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt14_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt14_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd ((__v2df) __B, + (__v2df) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __B, + (__v2df) __A, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_rsqrt14_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss ((__v4sf) __B, + (__v4sf) __A); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_ps (__m512 __W, __mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_sd (__mmask8 __U, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_sqrtsd_mask_round ((__v2df) __B, + (__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_ss (__m128 __A, 
__m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_sqrtss_mask_round ((__v4sf) __B, + (__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} +#else +#define _mm512_sqrt_round_pd(A, C) \ + (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_sqrt_round_pd(W, U, A, C) \ + (__m512d)__builtin_ia32_sqrtpd512_mask(A, W, U, C) + +#define _mm512_maskz_sqrt_round_pd(U, A, C) \ + (__m512d)__builtin_ia32_sqrtpd512_mask(A, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_sqrt_round_ps(A, C) \ + (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_undefined_ps(), -1, C) + +#define _mm512_mask_sqrt_round_ps(W, U, A, C) \ + (__m512)__builtin_ia32_sqrtps512_mask(A, W, U, C) + +#define _mm512_maskz_sqrt_round_ps(U, A, C) \ + (__m512)__builtin_ia32_sqrtps512_mask(A, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_sqrt_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \ + (__v2df) _mm_setzero_pd (), -1, C) + +#define _mm_mask_sqrt_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, W, U, C) + +#define _mm_maskz_sqrt_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_sqrtsd_mask_round (B, A, \ + (__v2df) _mm_setzero_pd (), U, C) + +#define _mm_sqrt_round_ss(A, B, C) \ + (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \ + (__v4sf) _mm_setzero_ps (), -1, C) + +#define _mm_mask_sqrt_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_sqrtss_mask_round (B, A, W, U, C) + +#define _mm_maskz_sqrt_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_sqrtss_mask_round (B, A, \ + (__v4sf) _mm_setzero_ps (), U, C) +#endif + +#define _mm_mask_sqrt_sd(W, U, A, B) \ + _mm_mask_sqrt_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_sqrt_sd(U, A, B) \ + _mm_maskz_sqrt_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_sqrt_ss(W, U, A, B) \ + _mm_mask_sqrt_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_sqrt_ss(U, A, B) \ + _mm_maskz_sqrt_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi8_epi32 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi8_epi32 (__mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + 
+extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi8_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_epi32 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_epi32 (__mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovsxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi64 (__m256i __X) +{ + return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi64 (__mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovsxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu8_epi32 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern 
__inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu8_epi32 (__m512i __W, __mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu8_epi32 (__mmask16 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbd512_mask ((__v16qi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu8_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu8_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxbq512_mask ((__v16qi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu16_epi32 (__m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu16_epi32 (__m512i __W, __mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu16_epi32 (__mmask16 __U, __m256i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwd512_mask ((__v16hi) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu16_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu16_epi64 (__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m512i) __builtin_ia32_pmovzxwq512_mask ((__v8hi) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_epi64 (__m256i __X) +{ + return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu32_epi64 (__m512i __W, __mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, + (__v8di) __W, + (__mmask8) __U); +} + 
+extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_epi64 (__mmask8 __U, __m256i __X) +{ + return (__m512i) __builtin_ia32_pmovzxdq512_mask ((__v8si) __X, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_sub_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} +#else +#define _mm512_add_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_add_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_addpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_add_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_addpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_add_round_ps(A, B, C) \ + (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) + +#define _mm512_mask_add_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_addps512_mask(A, B, W, U, C) + +#define _mm512_maskz_add_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_addps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_sub_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_sub_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_subpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_sub_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_subpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_sub_round_ps(A, B, C) \ + (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) + +#define _mm512_mask_sub_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_subps512_mask(A, B, W, U, C) + +#define _mm512_maskz_sub_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_subps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + 
(__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_round_pd (__m512d __M, __m512d __V, const int __R) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_round_pd (__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __R) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_round_pd (__mmask8 __U, __m512d __M, __m512d __V, + const int __R) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_mulsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_mulss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_round_ss (__m128 __W, __mmask8 __U, 
__m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_divsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_divss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +#else +#define _mm512_mul_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_mul_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_mulpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_mul_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_mulpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_mul_round_ps(A, B, C) \ + (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) + +#define _mm512_mask_mul_round_ps(W, U, A, B, C) \ + (__m512)__builtin_ia32_mulps512_mask(A, B, W, U, C) + +#define _mm512_maskz_mul_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_mulps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm512_div_round_pd(A, B, C) \ + (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, C) + +#define _mm512_mask_div_round_pd(W, U, A, B, C) \ + (__m512d)__builtin_ia32_divpd512_mask(A, B, W, U, C) + +#define _mm512_maskz_div_round_pd(U, A, B, C) \ + (__m512d)__builtin_ia32_divpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, C) + +#define _mm512_div_round_ps(A, B, C) \ + (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, C) + +#define _mm512_mask_div_round_ps(W, U, A, B, C) \ + 
(__m512)__builtin_ia32_divps512_mask(A, B, W, U, C) + +#define _mm512_maskz_div_round_ps(U, A, B, C) \ + (__m512)__builtin_ia32_divps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, C) + +#define _mm_mul_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_mulsd_round(A, B, C) + +#define _mm_mask_mul_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_mulsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_mul_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_mulsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_mul_round_ss(A, B, C) \ + (__m128)__builtin_ia32_mulss_round(A, B, C) + +#define _mm_mask_mul_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_mulss_mask_round(A, B, W, U, C) + +#define _mm_maskz_mul_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_mulss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#define _mm_div_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_divsd_round(A, B, C) + +#define _mm_mask_div_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_divsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_div_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_divsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_div_round_ss(A, B, C) \ + (__m128)__builtin_ia32_divss_round(A, B, C) + +#define _mm_mask_div_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_divss_mask_round(A, B, W, U, C) + +#define _mm_maskz_div_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_divss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_pd (__m512d __A, __m512d __B, const int __R) 
+{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_ps (__mmask16 __U, __m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} +#else +#define _mm512_max_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) + +#define _mm512_mask_max_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_maxpd512_mask(A, B, W, U, R) + +#define _mm512_maskz_max_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_maxpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) + +#define _mm512_max_round_ps(A, B, R) \ + (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_undefined_pd(), -1, R) + +#define _mm512_mask_max_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_maxps512_mask(A, B, W, U, R) + +#define _mm512_maskz_max_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_maxps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) + +#define _mm512_min_round_pd(A, B, R) \ + (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_undefined_pd(), -1, R) + +#define _mm512_mask_min_round_pd(W, U, A, B, R) \ + (__m512d)__builtin_ia32_minpd512_mask(A, B, W, U, R) + +#define _mm512_maskz_min_round_pd(U, A, B, R) \ + (__m512d)__builtin_ia32_minpd512_mask(A, B, (__v8df)_mm512_setzero_pd(), U, R) + +#define _mm512_min_round_ps(A, B, R) \ + (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_undefined_ps(), -1, R) + +#define _mm512_mask_min_round_ps(W, U, A, B, R) \ + (__m512)__builtin_ia32_minps512_mask(A, B, W, U, R) + +#define _mm512_maskz_min_round_ps(U, A, B, R) \ + (__m512)__builtin_ia32_minps512_mask(A, B, (__v16sf)_mm512_setzero_ps(), U, R) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_pd (__m512d __A, __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern 
__inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + const int __R) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_ps (__m512 __A, __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + const int __R) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_round_ss (__mmask8 __U, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} +#else +#define _mm512_scalef_round_pd(A, B, 
C) \ + ((__m512d) \ + __builtin_ia32_scalefpd512_mask((A), (B), \ + (__v8df) _mm512_undefined_pd(), \ + -1, (C))) + +#define _mm512_mask_scalef_round_pd(W, U, A, B, C) \ + ((__m512d) __builtin_ia32_scalefpd512_mask((A), (B), (W), (U), (C))) + +#define _mm512_maskz_scalef_round_pd(U, A, B, C) \ + ((__m512d) \ + __builtin_ia32_scalefpd512_mask((A), (B), \ + (__v8df) _mm512_setzero_pd(), \ + (U), (C))) + +#define _mm512_scalef_round_ps(A, B, C) \ + ((__m512) \ + __builtin_ia32_scalefps512_mask((A), (B), \ + (__v16sf) _mm512_undefined_ps(), \ + -1, (C))) + +#define _mm512_mask_scalef_round_ps(W, U, A, B, C) \ + ((__m512) __builtin_ia32_scalefps512_mask((A), (B), (W), (U), (C))) + +#define _mm512_maskz_scalef_round_ps(U, A, B, C) \ + ((__m512) \ + __builtin_ia32_scalefps512_mask((A), (B), \ + (__v16sf) _mm512_setzero_ps(), \ + (U), (C))) + +#define _mm_scalef_round_sd(A, B, C) \ + ((__m128d) \ + __builtin_ia32_scalefsd_mask_round ((A), (B), \ + (__v2df) _mm_undefined_pd (), \ + -1, (C))) + +#define _mm_scalef_round_ss(A, B, C) \ + ((__m128) \ + __builtin_ia32_scalefss_mask_round ((A), (B), \ + (__v4sf) _mm_undefined_ps (), \ + -1, (C))) + +#define _mm_mask_scalef_round_sd(W, U, A, B, C) \ + ((__m128d) \ + __builtin_ia32_scalefsd_mask_round ((A), (B), (W), (U), (C))) + +#define _mm_mask_scalef_round_ss(W, U, A, B, C) \ + ((__m128) \ + __builtin_ia32_scalefss_mask_round ((A), (B), (W), (U), (C))) + +#define _mm_maskz_scalef_round_sd(U, A, B, C) \ + ((__m128d) \ + __builtin_ia32_scalefsd_mask_round ((A), (B), \ + (__v2df) _mm_setzero_pd (), \ + (U), (C))) + +#define _mm_maskz_scalef_round_ss(U, A, B, C) \ + ((__m128) \ + __builtin_ia32_scalefss_mask_round ((A), (B), \ + (__v4sf) _mm_setzero_ps (), \ + (U), (C))) +#endif + +#define _mm_mask_scalef_sd(W, U, A, B) \ + _mm_mask_scalef_round_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_scalef_sd(U, A, B) \ + _mm_maskz_scalef_round_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_scalef_ss(W, U, A, B) \ + _mm_mask_scalef_round_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_scalef_ss(U, A, B) \ + _mm_maskz_scalef_round_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_maskz_fmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask3_fmsubadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_fnmadd_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_round_pd (__m512d __A, __m512d __B, __m512d __C, + __mmask8 __U, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512d __C, const int __R) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_round_ps (__m512 __A, __m512 __B, __m512 __C, + __mmask16 __U, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512 __C, const int __R) +{ + return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, __R); +} +#else +#define _mm512_fmadd_round_pd(A, B, C, R) \ + 
(__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddpd512_maskz(A, B, C, U, R) + +#define _mm512_fmadd_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfmaddps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddps512_maskz(A, B, C, U, R) + +#define _mm512_fmsub_round_pd(A, B, C, R) \ + (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmsubpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfmsubpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfmsubpd512_maskz(A, B, C, U, R) + +#define _mm512_fmsub_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmsubps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfmsubps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfmsubps512_maskz(A, B, C, U, R) + +#define _mm512_fmaddsub_round_pd(A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfmaddsubpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, C, U, R) + +#define _mm512_fmaddsub_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfmaddsubps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, C, U, R) + +#define _mm512_fmsubadd_round_pd(A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), -1, R) + +#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddsubpd512_mask(A, B, -(C), U, R) + +#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfmsubaddpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfmaddsubpd512_maskz(A, B, -(C), U, R) + +#define _mm512_fmsubadd_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), -1, R) + +#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfmaddsubps512_mask(A, B, -(C), U, R) + +#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfmsubaddps512_mask3(A, B, C, U, R) + +#define 
_mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfmaddsubps512_maskz(A, B, -(C), U, R) + +#define _mm512_fnmadd_round_pd(A, B, C, R) \ + (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfnmaddpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfnmaddpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfnmaddpd512_maskz(A, B, C, U, R) + +#define _mm512_fnmadd_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfnmaddps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfnmaddps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfnmaddps512_maskz(A, B, C, U, R) + +#define _mm512_fnmsub_round_pd(A, B, C, R) \ + (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ + (__m512d)__builtin_ia32_vfnmsubpd512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ + (__m512d)__builtin_ia32_vfnmsubpd512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ + (__m512d)__builtin_ia32_vfnmsubpd512_maskz(A, B, C, U, R) + +#define _mm512_fnmsub_round_ps(A, B, C, R) \ + (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, -1, R) + +#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ + (__m512)__builtin_ia32_vfnmsubps512_mask(A, B, C, U, R) + +#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ + (__m512)__builtin_ia32_vfnmsubps512_mask3(A, B, C, U, R) + +#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ + (__m512)__builtin_ia32_vfnmsubps512_maskz(A, B, C, U, R) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi64 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsq512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_epi32 (__m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_pabsd512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern 
__inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastss_ps (__m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) __O, __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastss512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastsd_pd (__m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) __O, __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +{ + return (__m512d) __builtin_ia32_broadcastsd512 ((__v2df) __A, + (__v8df) + _mm512_setzero_pd (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastd_epi32 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) __O, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi32 (int __A) +{ + return (__m512i)(__v16si) + { __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A }; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) +{ + return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi32 (__mmask16 __M, int __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastd512_gpr_mask (__A, + (__v16si) _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcastq_epi64 (__m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) +{ + return 
(__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) __O, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_pbroadcastq512 ((__v2di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_epi64 (long long __A) +{ + return (__m512i)(__v8di) { __A, __A, __A, __A, __A, __A, __A, __A }; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) +{ + return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m512i) + __builtin_ia32_pbroadcastq512_gpr_mask (__A, + (__v8di) _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f32x4 (__m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f32x4 (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) __O, + __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f32x4 (__mmask16 __M, __m128 __A) +{ + return (__m512) __builtin_ia32_broadcastf32x4_512 ((__v4sf) __A, + (__v16sf) + _mm512_setzero_ps (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_i32x4 (__m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i32x4 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i32x4 (__mmask16 __M, __m128i __A) +{ + return (__m512i) __builtin_ia32_broadcasti32x4_512 ((__v4si) __A, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_broadcast_f64x4 (__m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_f64x4 (__m512d __O, __mmask8 __M, __m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) __O, + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_f64x4 (__mmask8 __M, __m256d __A) +{ + return (__m512d) __builtin_ia32_broadcastf64x4_512 ((__v4df) __A, + (__v8df) + _mm512_setzero_pd (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_broadcast_i64x4 (__m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_broadcast_i64x4 (__m512i __O, __mmask8 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) __O, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_broadcast_i64x4 (__mmask8 __M, __m256i __A) +{ + return (__m512i) __builtin_ia32_broadcasti64x4_512 ((__v4di) __A, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +typedef enum +{ + _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, + _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, + _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, + _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, + _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, + _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, + _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, + _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, + _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, + _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, + _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, + _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, + _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, + _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, + _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, + _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, + _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, + _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, + _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, + _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, + _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, + _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, + _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, + _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, + _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, + _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, + _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, + _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, + _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, + _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, + _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, + _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, + _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, + _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, + _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, + _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, + _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, + _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, + _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, + _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, + _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, + 
_MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, + _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, + _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, + _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, + _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, + _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, + _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, + _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, + _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, + _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, + _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, + _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, + _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, + _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, + _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, + _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, + _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, + _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, + _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, + _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, + _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, + _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, + _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, + _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, + _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, + _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, + _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, + _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, + _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, + _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, + _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, + _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, + _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, + _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, + _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, + _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, + _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, + _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, + _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, + _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, + _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, + _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, + _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, + _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, + _MM_PERM_DDDD = 0xFF +} _MM_PERM_ENUM; + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_epi32 (__m512i __A, _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + _MM_PERM_ENUM __mask) +{ + return 
(__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_epi32 (__mmask16 __U, __m512i __A, _MM_PERM_ENUM __mask) +{ + return (__m512i) __builtin_ia32_pshufd512_mask ((__v16si) __A, + __mask, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_i64x2 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_i64x2 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_i64x2 (__mmask8 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_i32x4 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_i32x4 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_i32x4 (__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si) __A, + (__v16si) __B, + __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_f64x2 (__m512d __A, __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_f64x2 (__m512d __W, __mmask8 __U, __m512d __A, + __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_f64x2 (__mmask8 __U, __m512d __A, __m512d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df) __A, + (__v8df) __B, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_f32x4 (__m512 __A, __m512 __B, const int __imm) +{ + return (__m512) 
__builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_f32x4 (__m512 __W, __mmask16 __U, __m512 __A, + __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_f32x4 (__mmask16 __U, __m512 __A, __m512 __B, + const int __imm) +{ + return (__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf) __A, + (__v16sf) __B, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#else +#define _mm512_shuffle_epi32(X, C) \ + ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_epi32(W, U, X, C) \ + ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_epi32(U, X, C) \ + ((__m512i) __builtin_ia32_pshufd512_mask ((__v16si)(__m512i)(X), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) + +#define _mm512_shuffle_i64x2(X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C),\ + (__v8di)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask8)-1)) + +#define _mm512_mask_shuffle_i64x2(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_shuffle_i64x2(U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i64x2_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C),\ + (__v8di)(__m512i)_mm512_setzero_si512 (),\ + (__mmask8)(U))) + +#define _mm512_shuffle_i32x4(X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C),\ + (__v16si)(__m512i)_mm512_undefined_epi32 (),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_i32x4(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C),\ + (__v16si)(__m512i)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_i32x4(U, X, Y, C) \ + ((__m512i) __builtin_ia32_shuf_i32x4_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C),\ + (__v16si)(__m512i)_mm512_setzero_si512 (),\ + (__mmask16)(U))) + +#define _mm512_shuffle_f64x2(X, Y, C) \ + ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_undefined_pd(),\ + (__mmask8)-1)) + +#define _mm512_mask_shuffle_f64x2(W, U, X, Y, C) \ + ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_shuffle_f64x2(U, X, Y, C) \ + ((__m512d) __builtin_ia32_shuf_f64x2_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_shuffle_f32x4(X, Y, C) \ + ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_undefined_ps(),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_f32x4(W, U, X, Y, C) \ + ((__m512) __builtin_ia32_shuf_f32x4_mask 
((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_f32x4(U, X, Y, C) \ + ((__m512) __builtin_ia32_shuf_f32x4_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_setzero_ps(),\ + (__mmask16)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rolv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rorv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rolv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prolvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rorv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) 
__B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_prorvq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epi32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundpd_epu32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} +#else +#define _mm512_cvtt_roundpd_epi32(A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundpd_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) + +#define _mm512_cvtt_roundpd_epu32(A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundpd_epu32(U, A, B) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epi32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + 
(__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epi32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_epu32 (__m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_epu32 (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, __R); +} +#else +#define _mm512_cvt_roundpd_epi32(A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvt_roundpd_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvt_roundpd_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) + +#define _mm512_cvt_roundpd_epu32(A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_undefined_si256(), -1, B)) + +#define _mm512_mask_cvt_roundpd_epu32(W, U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)(W), U, B)) + +#define _mm512_maskz_cvt_roundpd_epu32(U, A, B) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask(A, (__v8si)_mm256_setzero_si256(), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epi32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundps_epu32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern 
__inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} +#else +#define _mm512_cvtt_roundps_epi32(A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) + +#define _mm512_mask_cvtt_roundps_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundps_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) + +#define _mm512_cvtt_roundps_epu32(A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) + +#define _mm512_mask_cvtt_roundps_epu32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvtt_roundps_epu32(U, A, B) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epi32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epi32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epi32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_epu32 (__m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_epu32 (__m512i __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_epu32 (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, __R); +} +#else +#define _mm512_cvt_roundps_epi32(A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) + +#define _mm512_mask_cvt_roundps_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvt_roundps_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) + +#define _mm512_cvt_roundps_epu32(A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_undefined_epi32 (), -1, B)) + +#define _mm512_mask_cvt_roundps_epu32(W, U, A, B) \ + 
((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)(W), U, B)) + +#define _mm512_maskz_cvt_roundps_epu32(U, A, B) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask(A, (__v16si)_mm512_setzero_si512 (), U, B)) +#endif + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_sd (__m128d __A, unsigned __B) +{ + return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B); +} + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_sd (__m128d __A, unsigned long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_sd (__m128d __A, long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi64_sd (__m128d __A, long long __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtsi2sd64 ((__v2df) __A, __B, __R); +} +#else +#define _mm_cvt_roundu64_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtusi2sd64(A, B, C) + +#define _mm_cvt_roundi64_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) + +#define _mm_cvt_roundsi64_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtsi2sd64(A, B, C) +#endif + +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu32_ss (__m128 __A, unsigned __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi32_ss (__m128 __A, int __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi32_ss (__m128 __A, int __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss32 ((__v4sf) __A, __B, __R); +} +#else +#define _mm_cvt_roundu32_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtusi2ss32(A, B, C) + +#define _mm_cvt_roundi32_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) + +#define _mm_cvt_roundsi32_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss32(A, B, C) +#endif + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_ss (__m128 __A, unsigned long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsi64_ss (__m128 __A, long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_ss (__m128 __A, long long __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsi2ss64 ((__v4sf) __A, __B, __R); +} +#else +#define _mm_cvt_roundu64_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtusi2ss64(A, B, C) + +#define _mm_cvt_roundi64_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) + +#define _mm_cvt_roundsi64_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsi2ss64(A, B, C) +#endif + +#endif + +extern __inline __m128i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) +{ + 
__builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_undefined_si256 (), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) + 
_mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqb512mem_mask ((unsigned long long *) __P, + (__v8di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqb512mem_mask ((unsigned long long *) __P, (__v8di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtsepi64_epi8 (__m128i 
__O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtusepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqb512mem_mask ((unsigned long long *) __P, (__v8di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_pd (__m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtdq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_pd (__m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d) __builtin_ia32_cvtudq2pd512_mask ((__v8si) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi32_ps (__m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) +{ + 
return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi32_ps (__mmask16 __U, __m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu32_ps (__m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu32_ps (__m512 __W, __mmask16 __U, __m512i __A, + const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu32_ps (__mmask16 __U, __m512i __A, const int __R) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +#else +#define _mm512_cvt_roundepi32_ps(A, B) \ + (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) + +#define _mm512_mask_cvt_roundepi32_ps(W, U, A, B) \ + (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), W, U, B) + +#define _mm512_maskz_cvt_roundepi32_ps(U, A, B) \ + (__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) + +#define _mm512_cvt_roundepu32_ps(A, B) \ + (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_undefined_ps(), -1, B) + +#define _mm512_mask_cvt_roundepu32_ps(W, U, A, B) \ + (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), W, U, B) + +#define _mm512_maskz_cvt_roundepu32_ps(U, A, B) \ + (__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(A), (__v16sf)_mm512_setzero_ps(), U, B) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf64x4_pd (__m512d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf64x4_pd (__m256d __W, __mmask8 __U, __m512d __A, + const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf64x4_pd (__mmask8 __U, __m512d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_extractf64x4_mask ((__v8df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extractf32x4_ps (__m512 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_undefined_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m512 __A, + const int __imm) +{ + return (__m128) 
__builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extractf32x4_ps (__mmask8 __U, __m512 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_mask ((__v16sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti64x4_epi64 (__m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_undefined_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti64x4_epi64 (__m256i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti64x4_epi64 (__mmask8 __U, __m512i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_extracti64x4_mask ((__v8di) __A, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_extracti32x4_epi32 (__m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m512i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_extracti32x4_epi32 (__mmask8 __U, __m512i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_mask ((__v16si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} +#else + +#define _mm512_extractf64x4_pd(X, C) \ + ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \ + (int) (C),\ + (__v4df)(__m256d)_mm256_undefined_pd(),\ + (__mmask8)-1)) + +#define _mm512_mask_extractf64x4_pd(W, U, X, C) \ + ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \ + (int) (C),\ + (__v4df)(__m256d)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extractf64x4_pd(U, X, C) \ + ((__m256d) __builtin_ia32_extractf64x4_mask ((__v8df)(__m512d) (X), \ + (int) (C),\ + (__v4df)(__m256d)_mm256_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_extractf32x4_ps(X, C) \ + ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \ + (int) (C),\ + (__v4sf)(__m128)_mm_undefined_ps(),\ + (__mmask8)-1)) + +#define _mm512_mask_extractf32x4_ps(W, U, X, C) \ + ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \ + (int) (C),\ + (__v4sf)(__m128)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extractf32x4_ps(U, X, C) \ + ((__m128) __builtin_ia32_extractf32x4_mask ((__v16sf)(__m512) (X), \ + (int) (C),\ + (__v4sf)(__m128)_mm_setzero_ps(),\ + (__mmask8)(U))) + +#define _mm512_extracti64x4_epi64(X, C) \ + ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \ + (int) (C),\ + (__v4di)(__m256i)_mm256_undefined_si256 (),\ + (__mmask8)-1)) + +#define 
_mm512_mask_extracti64x4_epi64(W, U, X, C) \ + ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \ + (int) (C),\ + (__v4di)(__m256i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extracti64x4_epi64(U, X, C) \ + ((__m256i) __builtin_ia32_extracti64x4_mask ((__v8di)(__m512i) (X), \ + (int) (C),\ + (__v4di)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)(U))) + +#define _mm512_extracti32x4_epi32(X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \ + (int) (C),\ + (__v4si)(__m128i)_mm_undefined_si128 (),\ + (__mmask8)-1)) + +#define _mm512_mask_extracti32x4_epi32(W, U, X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \ + (int) (C),\ + (__v4si)(__m128i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_extracti32x4_epi32(U, X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_mask ((__v16si)(__m512i) (X), \ + (int) (C),\ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti32x4 (__m512i __A, __m128i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __A, + (__v4si) __B, + __imm, + (__v16si) __A, -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf32x4 (__m512 __A, __m128 __B, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __A, + (__v4sf) __B, + __imm, + (__v16sf) __A, -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_inserti64x4 (__m512i __A, __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti64x4 (__m512i __W, __mmask8 __U, __m512i __A, + __m256i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti64x4 (__mmask8 __U, __m512i __A, __m256i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti64x4_mask ((__v8di) __A, + (__v4di) __B, + __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_insertf64x4 (__m512d __A, __m256d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf64x4 (__m512d __W, __mmask8 __U, __m512d __A, + __m256d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf64x4 (__mmask8 __U, __m512d __A, __m256d __B, + const int __imm) +{ + return (__m512d) __builtin_ia32_insertf64x4_mask ((__v8df) __A, + (__v4df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +#else +#define _mm512_insertf32x4(X, Y, C) \ + ((__m512) 
__builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \ + (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (X), (__mmask16)(-1))) + +#define _mm512_inserti32x4(X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ + (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (X), (__mmask16)(-1))) + +#define _mm512_insertf64x4(X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \ + (__v4df)(__m256d) (Y), (int) (C), \ + (__v8df)(__m512d)_mm512_undefined_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_insertf64x4(W, U, X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \ + (__v4df)(__m256d) (Y), (int) (C), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_insertf64x4(U, X, Y, C) \ + ((__m512d) __builtin_ia32_insertf64x4_mask ((__v8df)(__m512d) (X), \ + (__v4df)(__m256d) (Y), (int) (C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_inserti64x4(X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \ + (__v4di)(__m256i) (Y), (int) (C), \ + (__v8di)(__m512i)_mm512_undefined_epi32 (), \ + (__mmask8)-1)) + +#define _mm512_mask_inserti64x4(W, U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \ + (__v4di)(__m256i) (Y), (int) (C),\ + (__v8di)(__m512i)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_inserti64x4(U, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti64x4_mask ((__v8di)(__m512i) (X), \ + (__v4di)(__m256i) (Y), (int) (C), \ + (__v8di)(__m512i)_mm512_setzero_si512 (), \ + (__mmask8)(U))) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_pd (void const *__P) +{ + return *(__m512d_u *)__P; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_pd (void *__P, __m512d __A) +{ + *(__m512d_u *)__P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeupd512_mask ((double *) __P, (__v8df) __A, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_ps (void const *__P) +{ + return *(__m512_u *)__P; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_storeu_ps (void *__P, __m512 __A) +{ + *(__m512_u *)__P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeups512_mask ((float *) __P, (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float *__P) +{ + return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) __W, __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_ss (__mmask8 __U, const float *__P) +{ + return (__m128) __builtin_ia32_loadss_mask (__P, (__v4sf) _mm_setzero_ps (), + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double *__P) +{ + return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) __W, __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_sd (__mmask8 __U, const double *__P) +{ + return (__m128d) __builtin_ia32_loadsd_mask (__P, (__v2df) _mm_setzero_pd (), + __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B, + (__v4sf) __W, __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movess_mask ((__v4sf) __A, (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B, + (__v2df) __W, __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movesd_mask ((__v2df) __A, (__v2df) __B, + (__v2df) _mm_setzero_pd (), + __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_ss (float *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storess_mask (__P, (__v4sf) __A, (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_sd (double *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storesd_mask (__P, (__v2df) __A, (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi64 (void const *__P) +{ + return *(__m512i_u *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, + (__v8di) + _mm512_setzero_si512 (), + 
(__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi64 (void *__P, __m512i __A) +{ + *(__m512i_u *) __P = (__m512i_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_storedqudi512_mask ((long long *) __P, (__v8di) __A, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_si512 (void const *__P) +{ + return *(__m512i_u *)__P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_epi32 (void const *__P) +{ + return *(__m512i_u *) __P; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_loadu_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_si512 (void *__P, __m512i __A) +{ + *(__m512i_u *)__P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_epi32 (void *__P, __m512i __A) +{ + *(__m512i_u *) __P = (__m512i_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_storeu_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_storedqusi512_mask ((int *) __P, (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutevar_pd (__m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutevar_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutevar_pd (__mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d) __builtin_ia32_vpermilvarpd512_mask ((__v8df) __A, + (__v8di) __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutevar_ps (__m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutevar_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_maskz_permutevar_ps (__mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512) __builtin_ia32_vpermilvarps512_mask ((__v16sf) __A, + (__v16si) __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi64 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + /* idx */ , + (__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I, + __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_mask ((__v8di) __I + /* idx */ , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I, + __mmask8 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2varq512_mask ((__v8di) __A, + (__v8di) __I + /* idx */ , + (__v8di) __B, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varq512_maskz ((__v8di) __I + /* idx */ , + (__v8di) __A, + (__v8di) __B, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi32 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + /* idx */ , + (__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_mask ((__v16si) __I + /* idx */ , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I, + __mmask16 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2vard512_mask ((__v16si) __A, + (__v16si) __I + /* idx */ , + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2vard512_maskz ((__v16si) __I + /* idx */ , + (__v16si) __A, + (__v16si) __B, + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_pd (__m512d __A, __m512i __I, __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + /* idx */ , + (__v8df) __A, + (__v8df) __B, + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_mask ((__v8di) __I + /* idx */ , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermi2varpd512_mask ((__v8df) __A, + (__v8di) __I + /* idx */ , + (__v8df) __B, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I, + __m512d __B) +{ + return (__m512d) __builtin_ia32_vpermt2varpd512_maskz ((__v8di) __I + /* idx */ , + (__v8df) __A, + (__v8df) __B, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_ps (__m512 __A, __m512i __I, __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + /* idx */ , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_mask ((__v16si) __I + /* idx */ , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U, + __m512 __B) +{ + return (__m512) __builtin_ia32_vpermi2varps512_mask ((__v16sf) __A, + (__v16si) __I + /* idx */ , + (__v16sf) __B, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I, + __m512 __B) +{ + return (__m512) __builtin_ia32_vpermt2varps512_maskz ((__v16si) __I + /* idx */ , + (__v16sf) __A, + (__v16sf) __B, + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permute_pd (__m512d __X, const int __C) +{ + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permute_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __C) +{ + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permute_pd (__mmask8 __U, __m512d __X, const int __C) +{ + return (__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df) __X, __C, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permute_ps (__m512 __X, const int __C) +{ + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permute_ps (__m512 __W, __mmask16 __U, __m512 __X, const int __C) +{ + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permute_ps (__mmask16 __U, __m512 __X, const int __C) +{ + return (__m512) __builtin_ia32_vpermilps512_mask ((__v16sf) __X, __C, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) 
__U); +} +#else +#define _mm512_permute_pd(X, C) \ + ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \ + (__v8df)(__m512d)_mm512_undefined_pd(),\ + (__mmask8)(-1))) + +#define _mm512_mask_permute_pd(W, U, X, C) \ + ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_permute_pd(U, X, C) \ + ((__m512d) __builtin_ia32_vpermilpd512_mask ((__v8df)(__m512d)(X), (int)(C), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_permute_ps(X, C) \ + ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \ + (__v16sf)(__m512)_mm512_undefined_ps(),\ + (__mmask16)(-1))) + +#define _mm512_mask_permute_ps(W, U, X, C) \ + ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U))) + +#define _mm512_maskz_permute_ps(U, X, C) \ + ((__m512) __builtin_ia32_vpermilps512_mask ((__v16sf)(__m512)(X), (int)(C), \ + (__v16sf)(__m512)_mm512_setzero_ps(), \ + (__mmask16)(U))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex_epi64 (__m512i __X, const int __I) +{ + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) (-1)); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex_epi64 (__m512i __W, __mmask8 __M, + __m512i __X, const int __I) +{ + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) __W, + (__mmask8) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex_epi64 (__mmask8 __M, __m512i __X, const int __I) +{ + return (__m512i) __builtin_ia32_permdi512_mask ((__v8di) __X, __I, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex_pd (__m512d __X, const int __M) +{ + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex_pd (__m512d __W, __mmask8 __U, __m512d __X, const int __M) +{ + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex_pd (__mmask8 __U, __m512d __X, const int __M) +{ + return (__m512d) __builtin_ia32_permdf512_mask ((__v8df) __X, __M, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} +#else +#define _mm512_permutex_pd(X, M) \ + ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \ + (__v8df)(__m512d)_mm512_undefined_pd(),\ + (__mmask8)-1)) + +#define _mm512_mask_permutex_pd(W, U, X, M) \ + ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \ + (__v8df)(__m512d)(W), (__mmask8)(U))) + +#define _mm512_maskz_permutex_pd(U, X, M) \ + ((__m512d) __builtin_ia32_permdf512_mask ((__v8df)(__m512d)(X), (int)(M), \ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_permutex_epi64(X, I) \ + ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \ + (int)(I), \ + (__v8di)(__m512i) \ + 
(_mm512_undefined_epi32 ()),\ + (__mmask8)(-1))) + +#define _mm512_maskz_permutex_epi64(M, X, I) \ + ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \ + (int)(I), \ + (__v8di)(__m512i) \ + (_mm512_setzero_si512 ()),\ + (__mmask8)(M))) + +#define _mm512_mask_permutex_epi64(W, M, X, I) \ + ((__m512i) __builtin_ia32_permdi512_mask ((__v8di)(__m512i)(X), \ + (int)(I), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(M))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvardi512_mask ((__v8di) __Y, + (__v8di) __X, + (__v8di) __W, + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_permvarsi512_mask ((__v16si) __Y, + (__v16si) __X, + (__v16si) __W, + __M); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_pd (__m512i __X, __m512d __Y) +{ + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) +{ + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) +{ + return (__m512d) __builtin_ia32_permvardf512_mask ((__v8df) __Y, + (__v8di) __X, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_ps (__m512i __X, __m512 __Y) +{ + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) +{ + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) +{ + return (__m512) __builtin_ia32_permvarsf512_mask ((__v16sf) __Y, + (__v16si) __X, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_ps (__m512 __M, __m512 __V, const int __imm) +{ + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_ps (__m512 __W, __mmask16 __U, __m512 __M, + __m512 __V, const int __imm) +{ + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_ps (__mmask16 __U, __m512 __M, __m512 __V, const int __imm) +{ + return (__m512) __builtin_ia32_shufps512_mask ((__v16sf) __M, + (__v16sf) __V, __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shuffle_pd (__m512d __M, __m512d __V, const int __imm) +{ + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shuffle_pd (__m512d __W, __mmask8 __U, __m512d __M, + __m512d __V, const int __imm) +{ + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shuffle_pd (__mmask8 __U, __m512d __M, __m512d __V, + const int __imm) +{ + return (__m512d) __builtin_ia32_shufpd512_mask ((__v8df) __M, + (__v8df) __V, __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fixupimm_round_pd (__m512d __A, __m512d __B, __m512i __C, + const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_round_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_round_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_fixupimm_round_ps (__m512 __A, __m512 __B, __m512i __C, + const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_round_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_round_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512i __C, const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_round_sd (__m128d __A, __m128d __B, __m128i __C, + const int __imm, const int __R) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_round_sd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_round_ss (__m128 __A, __m128 __B, __m128i __C, + const int __imm, const int __R) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_round_ss (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm, const int __R) +{ + return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, __R); +} + +#else +#define _mm512_shuffle_pd(X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_undefined_pd(),\ + (__mmask8)-1)) + +#define _mm512_mask_shuffle_pd(W, U, X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)(W),\ + (__mmask8)(U))) + +#define _mm512_maskz_shuffle_pd(U, X, Y, C) \ + ((__m512d)__builtin_ia32_shufpd512_mask ((__v8df)(__m512d)(X), \ + 
(__v8df)(__m512d)(Y), (int)(C),\ + (__v8df)(__m512d)_mm512_setzero_pd(),\ + (__mmask8)(U))) + +#define _mm512_shuffle_ps(X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_undefined_ps(),\ + (__mmask16)-1)) + +#define _mm512_mask_shuffle_ps(W, U, X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)(W),\ + (__mmask16)(U))) + +#define _mm512_maskz_shuffle_ps(U, X, Y, C) \ + ((__m512)__builtin_ia32_shufps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(C),\ + (__v16sf)(__m512)_mm512_setzero_ps(),\ + (__mmask16)(U))) + +#define _mm512_fixupimm_round_pd(X, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(-1), (R))) + +#define _mm512_mask_fixupimm_round_pd(X, U, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm512_maskz_fixupimm_round_pd(U, X, Y, Z, C, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm512_fixupimm_round_ps(X, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(-1), (R))) + +#define _mm512_mask_fixupimm_round_ps(X, U, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), (R))) + +#define _mm512_maskz_fixupimm_round_ps(U, X, Y, Z, C, R) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), (R))) + +#define _mm_fixupimm_round_sd(X, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), (R))) + +#define _mm_mask_fixupimm_round_sd(X, U, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm_maskz_fixupimm_round_sd(U, X, Y, Z, C, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm_fixupimm_round_ss(X, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), (R))) + +#define _mm_mask_fixupimm_round_ss(X, U, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) + +#define _mm_maskz_fixupimm_round_ss(U, X, Y, Z, C, R) \ + ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), (R))) +#endif + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movehdup_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movshdup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_moveldup_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_movsldup512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A | (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A | (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_or_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_or_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_or_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A | (__v8du) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_or_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_or_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_porq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A ^ (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A ^ (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_mask_xor_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_xor_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxord512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_xor_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A ^ (__v8du) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_xor_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_xor_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pxorq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rol_epi32 (__m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rol_epi32 (__m512i __W, __mmask16 __U, __m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rol_epi32 (__mmask16 __U, __m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_prold512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ror_epi32 (__m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ror_epi32 (__m512i __W, __mmask16 __U, __m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ror_epi32 (__mmask16 __U, __m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prord512_mask ((__v16si) __A, __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rol_epi64 (__m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rol_epi64 (__m512i __W, __mmask8 __U, __m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B, 
+ (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rol_epi64 (__mmask8 __U, __m512i __A, const int __B) +{ + return (__m512i) __builtin_ia32_prolq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ror_epi64 (__m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ror_epi64 (__m512i __W, __mmask8 __U, __m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_ror_epi64 (__mmask8 __U, __m512i __A, int __B) +{ + return (__m512i) __builtin_ia32_prorq512_mask ((__v8di) __A, __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#else +#define _mm512_rol_epi32(A, B) \ + ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_undefined_epi32 (), \ + (__mmask16)(-1))) +#define _mm512_mask_rol_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U))) +#define _mm512_maskz_rol_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_prold512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_setzero_si512 (), \ + (__mmask16)(U))) +#define _mm512_ror_epi32(A, B) \ + ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_undefined_epi32 (), \ + (__mmask16)(-1))) +#define _mm512_mask_ror_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U))) +#define _mm512_maskz_ror_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_prord512_mask ((__v16si)(__m512i)(A), \ + (int)(B), \ + (__v16si)_mm512_setzero_si512 (), \ + (__mmask16)(U))) +#define _mm512_rol_epi64(A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_undefined_epi32 (), \ + (__mmask8)(-1))) +#define _mm512_mask_rol_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U))) +#define _mm512_maskz_rol_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_prolq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_setzero_si512 (), \ + (__mmask8)(U))) + +#define _mm512_ror_epi64(A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_undefined_epi32 (), \ + (__mmask8)(-1))) +#define _mm512_mask_ror_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U))) +#define _mm512_maskz_ror_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_prorq512_mask ((__v8di)(__m512i)(A), \ + (int)(B), \ + (__v8di)_mm512_setzero_si512 (), \ + (__mmask8)(U))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_and_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A & (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_and_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A & (__v16su) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_and_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_and_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_and_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A & (__v8du) __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_and_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_and_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_pd (), + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_si512 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_andnot_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_andnot_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_andnot_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_andnot_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_andnot_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pandnq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + 
_mm512_setzero_pd (), + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ptestmd512 ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_test_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq512 ((__v8di) __A, (__v8di) __B, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmd512 ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_testn_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq512 ((__v8di) __A, + (__v8di) __B, __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_ps (__m512 __A) +{ + return (__m512) _mm512_and_epi32 ((__m512i) __A, + _mm512_set1_epi32 (0x7fffffff)); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) _mm512_mask_and_epi32 ((__m512i) __W, __U, (__m512i) __A, + _mm512_set1_epi32 (0x7fffffff)); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_pd (__m512d __A) +{ + return (__m512d) _mm512_and_epi64 ((__m512i) __A, + _mm512_set1_epi64 (0x7fffffffffffffffLL)); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_abs_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) + _mm512_mask_and_epi64 ((__m512i) __W, __U, (__m512i) __A, + _mm512_set1_epi64 (0x7fffffffffffffffLL)); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i 
__B) +{ + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhdq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckhqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpckldq512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_punpcklqdq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_u64 (__m128 __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_cvt_roundss_si64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_i64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtss2si64 ((__v4sf) __A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_u64 (__m128 __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_i64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_si64 (__m128 __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, __R); +} +#else +#define _mm_cvt_roundss_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvtss2usi64(A, B)) + +#define _mm_cvt_roundss_si64(A, B) \ + ((long long)__builtin_ia32_vcvtss2si64(A, B)) + +#define _mm_cvt_roundss_i64(A, B) \ + ((long long)__builtin_ia32_vcvtss2si64(A, B)) + +#define _mm_cvtt_roundss_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvttss2usi64(A, B)) + +#define _mm_cvtt_roundss_i64(A, B) \ + ((long long)__builtin_ia32_vcvttss2si64(A, B)) + +#define _mm_cvtt_roundss_si64(A, B) \ + ((long long)__builtin_ia32_vcvttss2si64(A, B)) +#endif +#endif + +#ifdef __OPTIMIZE__ +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_u32 (__m128 __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_si32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_i32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvtss2si32 ((__v4sf) __A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_u32 (__m128 __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_i32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundss_si32 (__m128 __A, const int __R) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, __R); +} +#else +#define _mm_cvt_roundss_u32(A, B) \ + ((unsigned)__builtin_ia32_vcvtss2usi32(A, B)) + +#define _mm_cvt_roundss_si32(A, B) \ + ((int)__builtin_ia32_vcvtss2si32(A, B)) + +#define _mm_cvt_roundss_i32(A, B) \ + ((int)__builtin_ia32_vcvtss2si32(A, B)) + +#define _mm_cvtt_roundss_u32(A, B) \ + ((unsigned)__builtin_ia32_vcvttss2usi32(A, B)) + +#define _mm_cvtt_roundss_si32(A, B) \ + ((int)__builtin_ia32_vcvttss2si32(A, B)) + +#define _mm_cvtt_roundss_i32(A, B) \ + ((int)__builtin_ia32_vcvttss2si32(A, B)) +#endif + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline unsigned long long +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_u64 (__m128d __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_si64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_i64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsd2si64 ((__v2df) __A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_u64 (__m128d __A, const int __R) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_si64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_i64 (__m128d __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, __R); +} +#else +#define _mm_cvt_roundsd_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvtsd2usi64(A, B)) + +#define _mm_cvt_roundsd_si64(A, B) \ + ((long long)__builtin_ia32_vcvtsd2si64(A, B)) + +#define _mm_cvt_roundsd_i64(A, B) \ + ((long long)__builtin_ia32_vcvtsd2si64(A, B)) + +#define _mm_cvtt_roundsd_u64(A, B) \ + ((unsigned long long)__builtin_ia32_vcvttsd2usi64(A, B)) + +#define _mm_cvtt_roundsd_si64(A, B) \ + ((long long)__builtin_ia32_vcvttsd2si64(A, B)) + +#define _mm_cvtt_roundsd_i64(A, B) \ + ((long long)__builtin_ia32_vcvttsd2si64(A, B)) +#endif +#endif + +#ifdef __OPTIMIZE__ +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_u32 (__m128d __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_si32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_i32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsd2si32 ((__v2df) __A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_u32 (__m128d __A, const int __R) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_i32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsd_si32 (__m128d __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, __R); +} +#else +#define _mm_cvt_roundsd_u32(A, B) \ + ((unsigned)__builtin_ia32_vcvtsd2usi32(A, B)) + +#define _mm_cvt_roundsd_si32(A, B) \ + ((int)__builtin_ia32_vcvtsd2si32(A, B)) + +#define _mm_cvt_roundsd_i32(A, B) \ + ((int)__builtin_ia32_vcvtsd2si32(A, B)) + +#define _mm_cvtt_roundsd_u32(A, B) \ + 
((unsigned)__builtin_ia32_vcvttsd2usi32(A, B)) + +#define _mm_cvtt_roundsd_si32(A, B) \ + ((int)__builtin_ia32_vcvttsd2si32(A, B)) + +#define _mm_cvtt_roundsd_i32(A, B) \ + ((int)__builtin_ia32_vcvttsd2si32(A, B)) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_movedup_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_movddup512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpcklpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_unpckhpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpackhi_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpackhi_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return 
(__m512) __builtin_ia32_unpckhps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_pd (__m256 __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_pd (__m512d __W, __mmask8 __U, __m256 __A, + const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_pd (__mmask8 __U, __m256 __A, const int __R) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_ps (__m256i __A, const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_ps (__m512 __W, __mmask16 __U, __m256i __A, + const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_ps (__mmask16 __U, __m256i __A, const int __R) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundps_ph (__m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_undefined_si256 (), + -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_ph (__m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_undefined_si256 (), + -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundps_ph (__m256i __U, __mmask16 __W, __m512 __A, + const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_ph (__m256i __U, __mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) __U, + (__mmask16) __W); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundps_ph (__mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __W); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_ph (__mmask16 __W, __m512 __A, const int __I) +{ + return (__m256i) 
__builtin_ia32_vcvtps2ph512_mask ((__v16sf) __A, + __I, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __W); +} +#else +#define _mm512_cvt_roundps_pd(A, B) \ + (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_undefined_pd(), -1, B) + +#define _mm512_mask_cvt_roundps_pd(W, U, A, B) \ + (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)(W), U, B) + +#define _mm512_maskz_cvt_roundps_pd(U, A, B) \ + (__m512d)__builtin_ia32_cvtps2pd512_mask(A, (__v8df)_mm512_setzero_pd(), U, B) + +#define _mm512_cvt_roundph_ps(A, B) \ + (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_undefined_ps(), -1, B) + +#define _mm512_mask_cvt_roundph_ps(W, U, A, B) \ + (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)(W), U, B) + +#define _mm512_maskz_cvt_roundph_ps(U, A, B) \ + (__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(A), (__v16sf)_mm512_setzero_ps(), U, B) + +#define _mm512_cvt_roundps_ph(A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\ + (__v16hi)_mm256_undefined_si256 (), -1)) +#define _mm512_cvtps_ph(A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\ + (__v16hi)_mm256_undefined_si256 (), -1)) +#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\ + (__v16hi)(__m256i)(U), (__mmask16) (W))) +#define _mm512_mask_cvtps_ph(U, W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\ + (__v16hi)(__m256i)(U), (__mmask16) (W))) +#define _mm512_maskz_cvt_roundps_ph(W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\ + (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) +#define _mm512_maskz_cvtps_ph(W, A, I) \ + ((__m256i) __builtin_ia32_vcvtps2ph512_mask ((__v16sf)(__m512) (A), (int) (I),\ + (__v16hi)_mm256_setzero_si256 (), (__mmask16) (W))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_ps (__m512d __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_undefined_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_ps (__m256 __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_ps (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_ss (__m128 __A, __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_round ((__v4sf) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsd_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128d __B, const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + (__v4sf) __W, + __U, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsd_ss (__mmask8 __U, __m128 __A, + __m128d __B, 
const int __R) +{ + return (__m128) __builtin_ia32_cvtsd2ss_mask_round ((__v4sf) __A, + (__v2df) __B, + _mm_setzero_ps (), + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_sd (__m128d __A, __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_round ((__v2df) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundss_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + (__v2df) __W, + __U, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundss_sd (__mmask8 __U, __m128d __A, + __m128 __B, const int __R) +{ + return (__m128d) __builtin_ia32_cvtss2sd_mask_round ((__v2df) __A, + (__v4sf) __B, + _mm_setzero_pd (), + __U, + __R); +} +#else +#define _mm512_cvt_roundpd_ps(A, B) \ + (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_undefined_ps(), -1, B) + +#define _mm512_mask_cvt_roundpd_ps(W, U, A, B) \ + (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)(W), U, B) + +#define _mm512_maskz_cvt_roundpd_ps(U, A, B) \ + (__m256)__builtin_ia32_cvtpd2ps512_mask(A, (__v8sf)_mm256_setzero_ps(), U, B) + +#define _mm_cvt_roundsd_ss(A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_round(A, B, C) + +#define _mm_mask_cvt_roundsd_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), (W), (U), (C)) + +#define _mm_maskz_cvt_roundsd_ss(U, A, B, C) \ + (__m128)__builtin_ia32_cvtsd2ss_mask_round ((A), (B), _mm_setzero_ps (), \ + (U), (C)) + +#define _mm_cvt_roundss_sd(A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_round(A, B, C) + +#define _mm_mask_cvt_roundss_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), (W), (U), (C)) + +#define _mm_maskz_cvt_roundss_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_cvtss2sd_mask_round ((A), (B), _mm_setzero_pd (), \ + (U), (C)) + +#endif + +#define _mm_mask_cvtss_sd(W, U, A, B) \ + _mm_mask_cvt_roundss_sd ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_cvtss_sd(U, A, B) \ + _mm_maskz_cvt_roundss_sd ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_cvtsd_ss(W, U, A, B) \ + _mm_mask_cvt_roundsd_ss ((W), (U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_cvtsd_ss(U, A, B) \ + _mm_maskz_cvt_roundsd_ss ((U), (A), (B), _MM_FROUND_CUR_DIRECTION) + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_si512 (__m512i * __P, __m512i __A) +{ + __builtin_ia32_movntdq512 ((__v8di *) __P, (__v8di) __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_ps (float *__P, __m512 __A) +{ + __builtin_ia32_movntps512 (__P, (__v16sf) __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_pd (double *__P, __m512d __A) +{ + __builtin_ia32_movntpd512 (__P, (__v8df) __A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_stream_load_si512 (void *__P) +{ + return __builtin_ia32_movntdqa512 ((__v8di *)__P); +} + +/* Constants for mantissa extraction */ +typedef enum +{ + _MM_MANT_NORM_1_2, /* interval [1, 2) */ + _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */ + _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */ + 
_MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */ +} _MM_MANTISSA_NORM_ENUM; + +typedef enum +{ + _MM_MANT_SIGN_src, /* sign = sign(SRC) */ + _MM_MANT_SIGN_zero, /* sign = 0 */ + _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */ +} _MM_MANTISSA_SIGN_ENUM; + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_ps (__m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_ps (__mmask16 __U, __m512 __A, const int __R) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_pd (__m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, __R); +} + +extern __inline 
__m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_pd (__mmask8 __U, __m512d __A, const int __R) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_undefined_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_pd (__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_undefined_ps (), + (__mmask16) -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_round_sd (__m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) __W, + __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) + 
_mm_setzero_pd(), + __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_round_ss (__m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) __W, + __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) + _mm_setzero_ps(), + __U, __R); +} + +#else +#define _mm512_getmant_round_pd(X, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)_mm512_undefined_pd(), \ + (__mmask8)-1,\ + (R))) + +#define _mm512_mask_getmant_round_pd(W, U, X, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U),\ + (R))) + +#define _mm512_maskz_getmant_round_pd(U, X, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)_mm512_setzero_pd(), \ + (__mmask8)(U),\ + (R))) +#define _mm512_getmant_round_ps(X, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)_mm512_undefined_ps(), \ + (__mmask16)-1,\ + (R))) + +#define _mm512_mask_getmant_round_ps(W, U, X, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U),\ + (R))) + +#define _mm512_maskz_getmant_round_ps(U, X, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)_mm512_setzero_ps(), \ + (__mmask16)(U),\ + (R))) +#define _mm_getmant_round_sd(X, Y, C, D, R) \ + ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + (R))) + +#define _mm_mask_getmant_round_sd(W, U, X, Y, C, D, R) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U),\ + (R))) + +#define _mm_maskz_getmant_round_sd(U, X, Y, C, D, R) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)_mm_setzero_pd(), \ + (__mmask8)(U),\ + (R))) + +#define _mm_getmant_round_ss(X, Y, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (R))) + +#define _mm_mask_getmant_round_ss(W, U, X, Y, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U),\ + (R))) + +#define 
_mm_maskz_getmant_round_ss(U, X, Y, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)_mm_setzero_ps(), \ + (__mmask8)(U),\ + (R))) + +#define _mm_getexp_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), R)) + +#define _mm_mask_getexp_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U, C) + +#define _mm_maskz_getexp_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#define _mm_getexp_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), R)) + +#define _mm_mask_getexp_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_getexp_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + + +#define _mm512_getexp_round_ps(A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, R)) + +#define _mm512_mask_getexp_round_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(W), (__mmask16)(U), R)) + +#define _mm512_maskz_getexp_round_ps(U, A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), R)) + +#define _mm512_getexp_round_pd(A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), (__mmask8)-1, R)) + +#define _mm512_mask_getexp_round_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(W), (__mmask8)(U), R)) + +#define _mm512_maskz_getexp_round_pd(U, A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), R)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_round_ps (__m512 __A, const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, + (__v16sf) + _mm512_undefined_ps (), + -1, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_round_ps (__m512 __A, __mmask16 __B, __m512 __C, + const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm, + (__v16sf) __A, + (__mmask16) __B, __R); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_round_ps (__mmask16 __A, __m512 __B, + const int __imm, const int __R) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __A, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_round_pd (__m512d __A, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, + (__v8df) + _mm512_undefined_pd (), + -1, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_round_pd (__m512d __A, __mmask8 __B, + __m512d __C, const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm, + (__v8df) 
__A, + (__mmask8) __B, __R); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_round_pd (__mmask8 __A, __m512d __B, + const int __imm, const int __R) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __A, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_round_ss (__m128 __A, __m128 __B, const int __imm, + const int __R) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_round_ss (__m128 __A, __mmask8 __B, __m128 __C, + __m128 __D, const int __imm, const int __R) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __C, + (__v4sf) __D, __imm, + (__v4sf) __A, + (__mmask8) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_round_ss (__mmask8 __A, __m128 __B, __m128 __C, + const int __imm, const int __R) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __B, + (__v4sf) __C, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __A, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_round_sd (__m128d __A, __m128d __B, const int __imm, + const int __R) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_round_sd (__m128d __A, __mmask8 __B, __m128d __C, + __m128d __D, const int __imm, const int __R) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __C, + (__v2df) __D, __imm, + (__v2df) __A, + (__mmask8) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_round_sd (__mmask8 __A, __m128d __B, __m128d __C, + const int __imm, const int __R) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __B, + (__v2df) __C, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __A, + __R); +} + +#else +#define _mm512_roundscale_round_ps(A, B, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\ + (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), R)) +#define _mm512_mask_roundscale_round_ps(A, B, C, D, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \ + (int)(D), \ + (__v16sf)(__m512)(A), \ + (__mmask16)(B), R)) +#define _mm512_maskz_roundscale_round_ps(A, B, C, R) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \ + (int)(C), \ + (__v16sf)_mm512_setzero_ps(),\ + (__mmask16)(A), R)) +#define _mm512_roundscale_round_pd(A, B, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\ + (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), R)) +#define _mm512_mask_roundscale_round_pd(A, B, C, D, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \ + (int)(D), \ + (__v8df)(__m512d)(A), \ + (__mmask8)(B), R)) +#define _mm512_maskz_roundscale_round_pd(A, B, C, R) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \ + (int)(C), \ + 
(__v8df)_mm512_setzero_pd(),\ + (__mmask8)(A), R)) +#define _mm_roundscale_round_ss(A, B, I, R) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ + (__v4sf) (__m128) (B), \ + (int) (I), \ + (__v4sf) _mm_setzero_ps (), \ + (__mmask8) (-1), \ + (int) (R))) +#define _mm_mask_roundscale_round_ss(A, U, B, C, I, R) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), \ + (__v4sf) (__m128) (C), \ + (int) (I), \ + (__v4sf) (__m128) (A), \ + (__mmask8) (U), \ + (int) (R))) +#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ + (__v4sf) (__m128) (B), \ + (int) (I), \ + (__v4sf) _mm_setzero_ps (), \ + (__mmask8) (U), \ + (int) (R))) +#define _mm_roundscale_round_sd(A, B, I, R) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ + (__v2df) (__m128d) (B), \ + (int) (I), \ + (__v2df) _mm_setzero_pd (), \ + (__mmask8) (-1), \ + (int) (R))) +#define _mm_mask_roundscale_round_sd(A, U, B, C, I, R) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), \ + (__v2df) (__m128d) (C), \ + (int) (I), \ + (__v2df) (__m128d) (A), \ + (__mmask8) (U), \ + (int) (R))) +#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ + (__v2df) (__m128d) (B), \ + (int) (I), \ + (__v2df) _mm_setzero_pd (), \ + (__mmask8) (U), \ + (int) (R))) +#endif + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_floor_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_floor_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ceil_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_ceil_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_alignr_epi32 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_alignr_epi32 (__m512i __W, __mmask16 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_alignr_epi32 (__mmask16 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_alignd512_mask ((__v16si) __A, + (__v16si) __B, __imm, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_alignr_epi64 (__m512i __A, __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_alignr_epi64 (__m512i __W, __mmask8 __U, __m512i __A, + __m512i __B, const int __imm) +{ + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_alignr_epi64 (__mmask8 __U, __m512i __A, __m512i __B, + const int __imm) +{ + return (__m512i) __builtin_ia32_alignq512_mask ((__v8di) __A, + (__v8di) __B, __imm, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} +#else +#define _mm512_alignr_epi32(X, Y, C) \ + ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_undefined_epi32 (),\ + (__mmask16)-1)) + +#define _mm512_mask_alignr_epi32(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C), (__v16si)(__m512i)(W), \ + (__mmask16)(U))) + +#define _mm512_maskz_alignr_epi32(U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(C), (__v16si)_mm512_setzero_si512 (),\ + (__mmask16)(U))) + +#define _mm512_alignr_epi64(X, Y, C) \ + ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_undefined_epi32 (), \ + (__mmask8)-1)) + +#define _mm512_mask_alignr_epi64(W, U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C), (__v8di)(__m512i)(W), (__mmask8)(U))) + +#define _mm512_maskz_alignr_epi64(U, X, Y, C) \ + ((__m512i)__builtin_ia32_alignq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(C), (__v8di)_mm512_setzero_si512 (),\ + (__mmask8)(U))) +#endif + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi32_mask (__m512i __A, __m512i __B) 
+{ + return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqd512_mask ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, + (__v8di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq512_mask ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, + (__v16si) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtd512_mask ((__v16si) __A, + (__v16si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, + (__v8di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epi64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq512_mask ((__v8di) __A, + (__v8di) __B, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epi32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epu32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 5, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epi64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) -1); +} + 
+extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpge_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpge_epu64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epi32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epu32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 2, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epi64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_epu64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epi32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epu32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 1, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epi64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_epu64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epi32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epi32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epu32_mask (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epu32_mask (__m512i __X, __m512i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, 4, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epi64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epi64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_epu64_mask (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_epu64_mask (__m512i __X, __m512i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, 4, + (__mmask8) -1); +} + +#define _MM_CMPINT_EQ 0x0 +#define _MM_CMPINT_LT 0x1 +#define _MM_CMPINT_LE 0x2 +#define _MM_CMPINT_UNUSED 0x3 
+#define _MM_CMPINT_NE 0x4 +#define _MM_CMPINT_NLT 0x5 +#define _MM_CMPINT_GE 0x5 +#define _MM_CMPINT_NLE 0x6 +#define _MM_CMPINT_GT 0x6 + +#ifdef __OPTIMIZE__ +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftli_mask16 (__mmask16 __A, unsigned int __B) +{ + return (__mmask16) __builtin_ia32_kshiftlihi ((__mmask16) __A, + (__mmask8) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kshiftri_mask16 (__mmask16 __A, unsigned int __B) +{ + return (__mmask16) __builtin_ia32_kshiftrihi ((__mmask16) __A, + (__mmask8) __B); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi64_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epi32_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu64_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_epu32_mask (__m512i __X, __m512i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_round_pd_mask (__m512d __X, __m512d __Y, const int __P, + const int __R) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) -1, __R); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_round_ps_mask (__m512 __X, __m512 __Y, const int __P, const int __R) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) -1, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi64_mask (__mmask8 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epi32_mask (__mmask16 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_cmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu64_mask (__mmask8 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __X, + (__v8di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_epu32_mask (__mmask16 __U, __m512i __X, __m512i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __X, + (__v16si) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask8 +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_round_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, + const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) __U, __R); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_round_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, + const int __P, const int __R) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) __U, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_round_sd_mask (__m128d __X, __m128d __Y, const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_round_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, + const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_round_ss_mask (__m128 __X, __m128 __Y, const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, __R); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_round_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, + const int __P, const int __R) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, __R); +} + +#else +#define _kshiftli_mask16(X, Y) \ + ((__mmask16) __builtin_ia32_kshiftlihi ((__mmask16)(X), (__mmask8)(Y))) + +#define _kshiftri_mask16(X, Y) \ + ((__mmask16) __builtin_ia32_kshiftrihi ((__mmask16)(X), (__mmask8)(Y))) + +#define _mm512_cmp_epi64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm512_cmp_epi32_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P), \ + (__mmask16)-1)) + +#define _mm512_cmp_epu64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm512_cmp_epu32_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P), \ + (__mmask16)-1)) + +#define _mm512_cmp_round_pd_mask(X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)-1, R)) + +#define _mm512_cmp_round_ps_mask(X, Y, P, R) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)-1, R)) + +#define _mm512_mask_cmp_epi64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm512_mask_cmp_epi32_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P), \ + (__mmask16)(M))) + +#define _mm512_mask_cmp_epu64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di)(__m512i)(X), \ + (__v8di)(__m512i)(Y), (int)(P),\ + (__mmask8)(M))) + 
+#define _mm512_mask_cmp_epu32_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si)(__m512i)(X), \ + (__v16si)(__m512i)(Y), (int)(P), \ + (__mmask16)(M))) + +#define _mm512_mask_cmp_round_pd_mask(M, X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)(M), R)) + +#define _mm512_mask_cmp_round_ps_mask(M, X, Y, P, R) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)(M), R)) + +#define _mm_cmp_round_sd_mask(X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)-1, R)) + +#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (M), R)) + +#define _mm_cmp_round_ss_mask(X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, R)) + +#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (M), R)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_ps (__m512i __index, void const *__addr, int __scale) +{ + __m512 __v1_old = _mm512_undefined_ps (); + __mmask16 __mask = 0xFFFF; + + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_ps (__m512 __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m512) __builtin_ia32_gathersiv16sf ((__v16sf) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_pd (__m256i __index, void const *__addr, int __scale) +{ + __m512d __v1_old = _mm512_undefined_pd (); + __mmask8 __mask = 0xFF; + + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old, + __addr, + (__v8si) __index, __mask, + __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_pd (__m512d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, int __scale) +{ + return (__m512d) __builtin_ia32_gathersiv8df ((__v8df) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_ps (__m512i __index, void const *__addr, int __scale) +{ + __m256 __v1_old = _mm256_undefined_ps (); + __mmask8 __mask = 0xFF; + + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_ps (__m256 __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_pd (__m512i __index, void const *__addr, int __scale) +{ + __m512d __v1_old = _mm512_undefined_pd (); + __mmask8 
__mask = 0xFF; + + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_pd (__m512d __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m512d) __builtin_ia32_gatherdiv8df ((__v8df) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_epi32 (__m512i __index, void const *__addr, int __scale) +{ + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask16 __mask = 0xFFFF; + + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_epi32 (__m512i __v1_old, __mmask16 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m512i) __builtin_ia32_gathersiv16si ((__v16si) __v1_old, + __addr, + (__v16si) __index, + __mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32gather_epi64 (__m256i __index, void const *__addr, int __scale) +{ + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask8 __mask = 0xFF; + + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old, + __addr, + (__v8si) __index, __mask, + __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m512i) __builtin_ia32_gathersiv8di ((__v8di) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_epi32 (__m512i __index, void const *__addr, int __scale) +{ + __m256i __v1_old = _mm256_undefined_si256 (); + __mmask8 __mask = 0xFF; + + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_epi32 (__m256i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, int __scale) +{ + return (__m256i) __builtin_ia32_gatherdiv16si ((__v8si) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64gather_epi64 (__m512i __index, void const *__addr, int __scale) +{ + __m512i __v1_old = _mm512_undefined_epi32 (); + __mmask8 __mask = 0xFF; + + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old, + __addr, + (__v8di) __index, __mask, + __scale); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64gather_epi64 (__m512i __v1_old, __mmask8 __mask, + __m512i __index, void const *__addr, + int __scale) +{ + return (__m512i) __builtin_ia32_gatherdiv8di ((__v8di) __v1_old, + __addr, + (__v8di) __index, + __mask, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_ps (void *__addr, __m512i __index, __m512 __v1, int __scale) +{ + __builtin_ia32_scattersiv16sf (__addr, (__mmask16) 0xFFFF, + 
(__v16si) __index, (__v16sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_ps (void *__addr, __mmask16 __mask, + __m512i __index, __m512 __v1, int __scale) +{ + __builtin_ia32_scattersiv16sf (__addr, __mask, (__v16si) __index, + (__v16sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_pd (void *__addr, __m256i __index, __m512d __v1, + int __scale) +{ + __builtin_ia32_scattersiv8df (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, __m512d __v1, int __scale) +{ + __builtin_ia32_scattersiv8df (__addr, __mask, (__v8si) __index, + (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_ps (void *__addr, __m512i __index, __m256 __v1, int __scale) +{ + __builtin_ia32_scatterdiv16sf (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_ps (void *__addr, __mmask8 __mask, + __m512i __index, __m256 __v1, int __scale) +{ + __builtin_ia32_scatterdiv16sf (__addr, __mask, (__v8di) __index, + (__v8sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_pd (void *__addr, __m512i __index, __m512d __v1, + int __scale) +{ + __builtin_ia32_scatterdiv8df (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_pd (void *__addr, __mmask8 __mask, + __m512i __index, __m512d __v1, int __scale) +{ + __builtin_ia32_scatterdiv8df (__addr, __mask, (__v8di) __index, + (__v8df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_epi32 (void *__addr, __m512i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv16si (__addr, (__mmask16) 0xFFFF, + (__v16si) __index, (__v16si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_epi32 (void *__addr, __mmask16 __mask, + __m512i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv16si (__addr, __mask, (__v16si) __index, + (__v16si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i32scatter_epi64 (void *__addr, __m256i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv8di (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, + __m256i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scattersiv8di (__addr, __mask, (__v8si) __index, + (__v8di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_epi32 (void *__addr, __m512i __index, + __m256i __v1, int __scale) +{ + __builtin_ia32_scatterdiv16si (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8si) __v1, 
__scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, + __m512i __index, __m256i __v1, int __scale) +{ + __builtin_ia32_scatterdiv16si (__addr, __mask, (__v8di) __index, + (__v8si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_i64scatter_epi64 (void *__addr, __m512i __index, + __m512i __v1, int __scale) +{ + __builtin_ia32_scatterdiv8di (__addr, (__mmask8) 0xFF, + (__v8di) __index, (__v8di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, + __m512i __index, __m512i __v1, int __scale) +{ + __builtin_ia32_scatterdiv8di (__addr, __mask, (__v8di) __index, + (__v8di) __v1, __scale); +} +#else +#define _mm512_i32gather_ps(INDEX, ADDR, SCALE) \ + (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)_mm512_undefined_ps(),\ + (void const *) (ADDR), \ + (__v16si)(__m512i) (INDEX), \ + (__mmask16)0xFFFF, \ + (int) (SCALE)) + +#define _mm512_mask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512) __builtin_ia32_gathersiv16sf ((__v16sf)(__m512) (V1OLD), \ + (void const *) (ADDR), \ + (__v16si)(__m512i) (INDEX), \ + (__mmask16) (MASK), \ + (int) (SCALE)) + +#define _mm512_i32gather_pd(INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gathersiv8df ((__v8df)_mm512_undefined_pd(), \ + (void const *) (ADDR), \ + (__v8si)(__m256i) (INDEX), \ + (__mmask8)0xFF, (int) (SCALE)) + +#define _mm512_mask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gathersiv8df ((__v8df)(__m512d) (V1OLD), \ + (void const *) (ADDR), \ + (__v8si)(__m256i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm512_i64gather_ps(INDEX, ADDR, SCALE) \ + (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)_mm256_undefined_ps(), \ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8)0xFF, (int) (SCALE)) + +#define _mm512_mask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256) __builtin_ia32_gatherdiv16sf ((__v8sf)(__m256) (V1OLD), \ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm512_i64gather_pd(INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)_mm512_undefined_pd(), \ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8)0xFF, (int) (SCALE)) + +#define _mm512_mask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512d) __builtin_ia32_gatherdiv8df ((__v8df)(__m512d) (V1OLD), \ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm512_i32gather_epi32(INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gathersiv16si ((__v16si)_mm512_undefined_epi32 (),\ + (void const *) (ADDR), \ + (__v16si)(__m512i) (INDEX), \ + (__mmask16)0xFFFF, \ + (int) (SCALE)) + +#define _mm512_mask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gathersiv16si ((__v16si)(__m512i) (V1OLD), \ + (void const *) (ADDR), \ + (__v16si)(__m512i) (INDEX), \ + (__mmask16) (MASK), \ + (int) (SCALE)) + +#define _mm512_i32gather_epi64(INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gathersiv8di ((__v8di)_mm512_undefined_epi32 (),\ + (void const *) (ADDR), \ + (__v8si)(__m256i) (INDEX), \ + (__mmask8)0xFF, (int) (SCALE)) + +#define _mm512_mask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512i) 
__builtin_ia32_gathersiv8di ((__v8di)(__m512i) (V1OLD), \ + (void const *) (ADDR), \ + (__v8si)(__m256i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm512_i64gather_epi32(INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)_mm256_undefined_si256(),\ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8)0xFF, (int) (SCALE)) + +#define _mm512_mask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gatherdiv16si ((__v8si)(__m256i) (V1OLD), \ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm512_i64gather_epi64(INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)_mm512_undefined_epi32 (),\ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8)0xFF, (int) (SCALE)) + +#define _mm512_mask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m512i) __builtin_ia32_gatherdiv8di ((__v8di)(__m512i) (V1OLD), \ + (void const *) (ADDR), \ + (__v8di)(__m512i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm512_i32scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16)0xFFFF, \ + (__v16si)(__m512i) (INDEX), \ + (__v16sf)(__m512) (V1), (int) (SCALE)) + +#define _mm512_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16sf ((void *) (ADDR), (__mmask16) (MASK), \ + (__v16si)(__m512i) (INDEX), \ + (__v16sf)(__m512) (V1), (int) (SCALE)) + +#define _mm512_i32scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8si)(__m256i) (INDEX), \ + (__v8df)(__m512d) (V1), (int) (SCALE)) + +#define _mm512_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8df ((void *) (ADDR), (__mmask8) (MASK), \ + (__v8si)(__m256i) (INDEX), \ + (__v8df)(__m512d) (V1), (int) (SCALE)) + +#define _mm512_i64scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8di)(__m512i) (INDEX), \ + (__v8sf)(__m256) (V1), (int) (SCALE)) + +#define _mm512_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16sf ((void *) (ADDR), (__mmask16) (MASK), \ + (__v8di)(__m512i) (INDEX), \ + (__v8sf)(__m256) (V1), (int) (SCALE)) + +#define _mm512_i64scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8di)(__m512i) (INDEX), \ + (__v8df)(__m512d) (V1), (int) (SCALE)) + +#define _mm512_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8df ((void *) (ADDR), (__mmask8) (MASK), \ + (__v8di)(__m512i) (INDEX), \ + (__v8df)(__m512d) (V1), (int) (SCALE)) + +#define _mm512_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16)0xFFFF, \ + (__v16si)(__m512i) (INDEX), \ + (__v16si)(__m512i) (V1), (int) (SCALE)) + +#define _mm512_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv16si ((void *) (ADDR), (__mmask16) (MASK), \ + (__v16si)(__m512i) (INDEX), \ + (__v16si)(__m512i) (V1), (int) (SCALE)) + +#define _mm512_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8si)(__m256i) (INDEX), \ + (__v8di)(__m512i) (V1), (int) (SCALE)) + +#define _mm512_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8di ((void *) (ADDR), (__mmask8) (MASK), \ + (__v8si)(__m256i) (INDEX), \ + (__v8di)(__m512i) (V1), 
(int) (SCALE)) + +#define _mm512_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8di)(__m512i) (INDEX), \ + (__v8si)(__m256i) (V1), (int) (SCALE)) + +#define _mm512_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv16si ((void *) (ADDR), (__mmask8) (MASK), \ + (__v8di)(__m512i) (INDEX), \ + (__v8si)(__m256i) (V1), (int) (SCALE)) + +#define _mm512_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8di)(__m512i) (INDEX), \ + (__v8di)(__m512i) (V1), (int) (SCALE)) + +#define _mm512_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8di ((void *) (ADDR), (__mmask8) (MASK), \ + (__v8di)(__m512i) (INDEX), \ + (__v8di)(__m512i) (V1), (int) (SCALE)) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) __W, + 
(__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_maskz ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_maskz ((const __v8df *) __P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_maskz ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_ps (__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_maskz ((const __v16sf *) __P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_maskz ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) 
+{ + return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) + __builtin_ia32_expandloaddi512_maskz ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_maskz ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_maskz ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 + (), (__mmask16) __U); +} + +/* Mask arithmetic operations */ +#define _kand_mask16 _mm512_kand +#define _kandn_mask16 _mm512_kandn +#define _knot_mask16 _mm512_knot +#define _kor_mask16 _mm512_kor +#define _kxnor_mask16 _mm512_kxnor +#define _kxor_mask16 _mm512_kxor + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortest_mask16_u8 (__mmask16 __A, __mmask16 __B, unsigned char *__CF) +{ + *__CF = (unsigned char) __builtin_ia32_kortestchi (__A, __B); + return (unsigned char) __builtin_ia32_kortestzhi (__A, __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestz_mask16_u8 (__mmask16 __A, __mmask16 __B) +{ + return (unsigned char) __builtin_ia32_kortestzhi ((__mmask16) __A, + (__mmask16) __B); +} + +extern __inline unsigned char +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kortestc_mask16_u8 (__mmask16 __A, __mmask16 __B) +{ + return (unsigned char) __builtin_ia32_kortestchi ((__mmask16) __A, + (__mmask16) __B); +} + +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtmask16_u32 (__mmask16 __A) +{ + return (unsigned int) __builtin_ia32_kmovw ((__mmask16 ) __A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cvtu32_mask16 (unsigned int __A) +{ + return (__mmask16) __builtin_ia32_kmovw ((__mmask16 ) __A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_load_mask16 (__mmask16 *__A) +{ + return (__mmask16) __builtin_ia32_kmovw (*(__mmask16 *) __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_store_mask16 (__mmask16 *__A, __mmask16 __B) +{ + *(__mmask16 *) __A = __builtin_ia32_kmovw (__B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kand (__mmask16 
__A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kandn (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, + (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kortestz (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kortestzhi ((__mmask16) __A, + (__mmask16) __B); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kortestc (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kortestchi ((__mmask16) __A, + (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kxnor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kxor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_knot (__mmask16 __A) +{ + return (__mmask16) __builtin_ia32_knothi ((__mmask16) __A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kunpackb (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_kunpackb_mask16 (__mmask8 __A, __mmask8 __B) +{ + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_inserti32x4 (__mmask16 __B, __m512i __C, __m128i __D, + const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) + _mm512_setzero_si512 (), + __B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_insertf32x4 (__mmask16 __B, __m512 __C, __m128 __D, + const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + (__v4sf) __D, + __imm, + (__v16sf) + _mm512_setzero_ps (), __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_inserti32x4 (__m512i __A, __mmask16 __B, __m512i __C, + __m128i __D, const int __imm) +{ + return (__m512i) __builtin_ia32_inserti32x4_mask ((__v16si) __C, + (__v4si) __D, + __imm, + (__v16si) __A, + __B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_insertf32x4 (__m512 __A, __mmask16 __B, __m512 __C, + __m128 __D, const int __imm) +{ + return (__m512) __builtin_ia32_insertf32x4_mask ((__v16sf) __C, + (__v4sf) __D, + __imm, + (__v16sf) __A, __B); +} +#else +#define _mm512_maskz_insertf32x4(A, X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) 
(X), \ + (__v4sf)(__m128) (Y), (int) (C), (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A))) + +#define _mm512_maskz_inserti32x4(A, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ + (__v4si)(__m128i) (Y), (int) (C), (__v16si)_mm512_setzero_si512 (), \ + (__mmask16)(A))) + +#define _mm512_mask_insertf32x4(A, B, X, Y, C) \ + ((__m512) __builtin_ia32_insertf32x4_mask ((__v16sf)(__m512) (X), \ + (__v4sf)(__m128) (Y), (int) (C), (__v16sf)(__m512) (A), \ + (__mmask16)(B))) + +#define _mm512_mask_inserti32x4(A, B, X, Y, C) \ + ((__m512i) __builtin_ia32_inserti32x4_mask ((__v16si)(__m512i) (X), \ + (__v4si)(__m128i) (Y), (int) (C), (__v16si)(__m512i) (A), \ + (__mmask16)(B))) +#endif + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu64 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + 
_mm512_undefined_epi32 (), + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminuq512_mask ((__v8di) __A, + (__v8di) __B, + (__v8di) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminsd512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_epu32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pmaxud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_epu32 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1); +} + +extern __inline __m512i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) + _mm512_setzero_si512 (), + __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_pminud512_mask ((__v16si) __A, + (__v16si) __B, + (__v16si) __W, __M); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_unpacklo_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_unpacklo_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_unpcklps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_maxsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_maxss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_sd (__m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_minsd_round ((__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_mask_min_round_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_round_sd (__mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_ss (__m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_minss_round ((__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_round_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_round_ss (__mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, __R); +} + +#else +#define _mm_max_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_maxsd_round(A, B, C) + +#define _mm_mask_max_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_maxsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_max_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_maxsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_max_round_ss(A, B, C) \ + (__m128)__builtin_ia32_maxss_round(A, B, C) + +#define _mm_mask_max_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_maxss_mask_round(A, B, W, U, C) + +#define _mm_maskz_max_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_maxss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#define _mm_min_round_sd(A, B, C) \ + (__m128d)__builtin_ia32_minsd_round(A, B, C) + +#define _mm_mask_min_round_sd(W, U, A, B, C) \ + (__m128d)__builtin_ia32_minsd_mask_round(A, B, W, U, C) + +#define _mm_maskz_min_round_sd(U, A, B, C) \ + (__m128d)__builtin_ia32_minsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U, C) + +#define _mm_min_round_ss(A, B, C) \ + (__m128)__builtin_ia32_minss_round(A, B, C) + +#define _mm_mask_min_round_ss(W, U, A, B, C) \ + (__m128)__builtin_ia32_minss_mask_round(A, B, W, U, C) + +#define _mm_maskz_min_round_ss(U, A, B, C) \ + (__m128)__builtin_ia32_minss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U, C) + +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_pd (__mmask8 __U, __m512d __A, __m512d __W) +{ + return (__m512d) __builtin_ia32_blendmpd_512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_ps (__mmask16 __U, __m512 __A, __m512 __W) +{ + return (__m512) __builtin_ia32_blendmps_512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi64 (__mmask8 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmq_512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern 
__inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_epi32 (__mmask16 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_blendmd_512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_round ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_round ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + __R); +} +#else +#define _mm_fmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, C, R) + +#define _mm_fmadd_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, B, C, R) + +#define _mm_fmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, B, -(C), R) + +#define _mm_fmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, B, -(C), R) + +#define _mm_fnmadd_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), C, R) + +#define _mm_fnmadd_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), C, R) + +#define _mm_fnmsub_round_sd(A, B, C, R) \ + (__m128d)__builtin_ia32_vfmaddsd3_round(A, -(B), -(C), R) + +#define _mm_fnmsub_round_ss(A, B, C, R) \ + (__m128)__builtin_ia32_vfmaddss3_round(A, -(B), -(C), R) +#endif + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) 
__builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + 
-(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, + 
_MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + (__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern 
__inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + (__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + (__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_round_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_mask ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_round_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_mask ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_round_sd (__m128d __W, __m128d __A, __m128d __B, __mmask8 __U, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmsubsd3_mask3 ((__v2df) __W, + -(__v2df) __A, + (__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_round_ss (__m128 __W, __m128 __A, __m128 __B, __mmask8 __U, + const int __R) +{ + return (__m128) __builtin_ia32_vfmsubss3_mask3 ((__v4sf) __W, + -(__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_round_sd (__mmask8 __U, __m128d __W, __m128d __A, __m128d __B, + const int __R) +{ + return (__m128d) __builtin_ia32_vfmaddsd3_maskz ((__v2df) __W, + -(__v2df) __A, + -(__v2df) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_round_ss (__mmask8 __U, __m128 __W, __m128 __A, __m128 __B, + const int __R) +{ + return (__m128) __builtin_ia32_vfmaddss3_maskz ((__v4sf) __W, + -(__v4sf) __A, + -(__v4sf) __B, + (__mmask8) __U, __R); +} +#else +#define _mm_mask_fmadd_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, C, U, R) + +#define _mm_mask_fmadd_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, B, C, U, R) + +#define _mm_mask3_fmadd_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, B, C, U, R) + +#define _mm_mask3_fmadd_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask3 (A, B, C, U, R) + +#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, C, U, R) + +#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, C, U, R) + +#define _mm_mask_fmsub_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, B, -(C), U, R) + +#define _mm_mask_fmsub_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, B, -(C), U, R) + +#define _mm_mask3_fmsub_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, B, C, U, R) + +#define _mm_mask3_fmsub_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmsubss3_mask3 (A, B, C, U, R) + +#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, B, -(C), U, R) + +#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, B, -(C), U, R) + +#define _mm_mask_fnmadd_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), C, U, R) + +#define _mm_mask_fnmadd_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), C, U, R) + +#define _mm_mask3_fnmadd_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask3 (A, -(B), C, U, R) + +#define _mm_mask3_fnmadd_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask3 (A, -(B), C, U, R) + +#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), C, U, R) + +#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), C, U, R) + +#define _mm_mask_fnmsub_round_sd(A, U, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_mask (A, -(B), -(C), U, R) + +#define _mm_mask_fnmsub_round_ss(A, U, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_mask (A, -(B), -(C), U, R) + +#define _mm_mask3_fnmsub_round_sd(A, B, C, U, R) \ + (__m128d) __builtin_ia32_vfmsubsd3_mask3 (A, -(B), C, U, R) + +#define _mm_mask3_fnmsub_round_ss(A, B, C, U, R) \ + (__m128) __builtin_ia32_vfmsubss3_mask3 (A, -(B), C, U, R) + +#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ + (__m128d) __builtin_ia32_vfmaddsd3_maskz (A, -(B), -(C), U, R) + +#define 
_mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ + (__m128) __builtin_ia32_vfmaddss3_maskz (A, -(B), -(C), U, R) +#endif + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_ss (__m128 __A, __m128 __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomiss ((__v4sf) __A, (__v4sf) __B, __P, __R); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_sd (__m128d __A, __m128d __B, const int __P, const int __R) +{ + return __builtin_ia32_vcomisd ((__v2df) __A, (__v2df) __B, __P, __R); +} +#else +#define _mm_comi_round_ss(A, B, C, D)\ +__builtin_ia32_vcomiss(A, B, C, D) +#define _mm_comi_round_sd(A, B, C, D)\ +__builtin_ia32_vcomisd(A, B, C, D) +#endif + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_sqrtpd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_sqrtps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_pd (__m512d __A, __m512d __B) +{ + return (__m512d) ((__v8df)__A + (__v8df)__B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_addpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A + (__v16sf)__B); +} 
+ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_addps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_pd (__m512d __A, __m512d __B) +{ + return (__m512d) ((__v8df)__A - (__v8df)__B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_subpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A - (__v16sf)__B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_subps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + 
_MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_pd (__m512d __A, __m512d __B) +{ + return (__m512d) ((__v8df)__A * (__v8df)__B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_mulpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A * (__v16sf)__B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_mulps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + 
_mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_pd (__m512d __M, __m512d __V) +{ + return (__m512d) ((__v8df)__M / (__v8df)__V); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_pd (__m512d __W, __mmask8 __U, __m512d __M, __m512d __V) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_pd (__mmask8 __U, __m512d __M, __m512d __V) +{ + return (__m512d) __builtin_ia32_divpd512_mask ((__v8df) __M, + (__v8df) __V, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_ps (__m512 __A, __m512 __B) +{ + return (__m512) ((__v16sf)__A / (__v16sf)__B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_divps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_sd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ss (__m128 __W, __mmask8 __U, __m128 __A, + __m128 __B) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divss_mask_round ((__v4sf) __A, + 
(__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 
_MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline 
__m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_fmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm512_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_maskz_fmsubadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask3_fnmadd_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfnmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_pd (__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_pd (__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_pd (__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfnmsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_ps (__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_ps (__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfnmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_ps (__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfnmsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epi32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epi32 
(__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epi32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epi32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttps_epu32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_epi32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_epu32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_undefined_epi32 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_epu32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsd_f64 (__m512d __A) +{ + return __A[0]; +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtss_f32 (__m512 __A) +{ + return __A[0]; +} + +#ifdef 
__x86_64__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_ss (__m128 __A, unsigned long long __B) +{ + return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_sd (__m128d __A, unsigned long long __B) +{ + return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_ss (__m128 __A, unsigned __B) +{ + return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_ps (__m512i __A) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtdq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_ps (__m512i __A) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) +{ + return (__m512) __builtin_ia32_cvtudq2ps512_mask ((__v16si) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fixupimm_pd (__m512d __A, __m512d __B, __m512i __C, const int __imm) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_pd (__m512d __A, __mmask8 __U, __m512d __B, + __m512i __C, const int __imm) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_pd (__mmask8 __U, __m512d __A, __m512d __B, + __m512i __C, const int __imm) +{ + return (__m512d) __builtin_ia32_fixupimmpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8di) __C, + __imm, + 
(__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fixupimm_ps (__m512 __A, __m512 __B, __m512i __C, const int __imm) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fixupimm_ps (__m512 __A, __mmask16 __U, __m512 __B, + __m512i __C, const int __imm) +{ + return (__m512) __builtin_ia32_fixupimmps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fixupimm_ps (__mmask16 __U, __m512 __A, __m512 __B, + __m512i __C, const int __imm) +{ + return (__m512) __builtin_ia32_fixupimmps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16si) __C, + __imm, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_sd (__m128d __A, __m128d __B, __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_sd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_sd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmsd_maskz ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_ss (__m128 __A, __m128 __B, __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_ss (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_ss (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmss_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, __imm, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} +#else +#define _mm512_fixupimm_pd(X, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_pd(X, U, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + 
(__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_fixupimm_pd(U, X, Y, Z, C) \ + ((__m512d)__builtin_ia32_fixupimmpd512_maskz ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (__v8di)(__m512i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_fixupimm_ps(X, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_ps(X, U, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_fixupimm_ps(U, X, Y, Z, C) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (__v16si)(__m512i)(Z), (int)(C), \ + (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_sd(X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_sd(X, U, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_sd(U, X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_ss(X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_ss(X, U, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_ss(U, X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmss_maskz ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __x86_64__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif /* __x86_64__ */ + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsi512_si32 (__m512i __A) +{ + __v16si __B = (__v16si) __A; + return __B[0]; +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_u32 (__m128 __A) +{ + return 
(unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_cvtsd2si ((__v2df) __A); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_cvtss2si ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_ss (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); +} + +#ifdef __x86_64__ +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sd (__m128d __A, long long __B) +{ + return (__m128d) __builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} +#endif /* __x86_64__ */ + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtps_pd (__m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_undefined_pd (), + 
(__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) +{ + return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_ps (__m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_ps (__m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_undefined_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + 
_MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss128_round ((__v4sf) __A, + (__v4sf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd128_round ((__v2df) __A, + (__v2df) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_pd (__m512d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_pd (__m512d __W, __mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_pd (__mmask8 __U, __m512d __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512d) __builtin_ia32_getmantpd512_mask ((__v8df) __A, + (__C << 2) | __B, + (__v8df) + _mm512_setzero_pd (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_ps (__m512 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_ps (__m512 __W, __mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_ps (__mmask16 __U, __m512 __A, + _MM_MANTISSA_NORM_ENUM __B, _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512) __builtin_ia32_getmantps512_mask ((__v16sf) __A, + (__C << 2) | __B, + (__v16sf) + _mm512_setzero_ps (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_sd (__m128d __A, __m128d __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) __W, + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_sd (__mmask8 __U, __m128d __A, __m128d __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128d) __builtin_ia32_getmantsd_mask_round ((__v2df) __A, + (__v2df) __B, + (__D << 2) | __C, + (__v2df) + _mm_setzero_pd(), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ss (__m128 __A, __m128 __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) __W, + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ss (__mmask8 __U, __m128 __A, __m128 __B, + _MM_MANTISSA_NORM_ENUM __C, _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128) __builtin_ia32_getmantss_mask_round ((__v4sf) __A, + (__v4sf) __B, + (__D << 2) | __C, + (__v4sf) + _mm_setzero_ps(), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_getmant_pd(X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1,\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_pd(W, U, 
X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_pd(U, X, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask ((__v8df)(__m512d)(X), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) +#define _mm512_getmant_ps(X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1,\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ps(W, U, X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_ps(U, X, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask ((__v16sf)(__m512)(X), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U),\ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_getmant_sd(X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sd(W, U, X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_sd(U, X, Y, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_mask_round ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getmant_ss(X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_ss(W, U, X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_ss(U, X, Y, C, D) \ + ((__m128)__builtin_ia32_getmantss_mask_round ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getexp_ss(A, B) \ + ((__m128)__builtin_ia32_getexpss128_round((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getexp_ss(W, U, A, B) \ + (__m128)__builtin_ia32_getexpss_mask_round(A, B, W, U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_getexp_ss(U, A, B) \ + (__m128)__builtin_ia32_getexpss_mask_round(A, B, (__v4sf)_mm_setzero_ps(), U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm_getexp_sd(A, B) \ + ((__m128d)__builtin_ia32_getexpsd128_round((__v2df)(__m128d)(A), (__v2df)(__m128d)(B),\ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getexp_sd(W, U, A, B) \ + (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, W, U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_getexp_sd(U, A, B) \ + (__m128d)__builtin_ia32_getexpsd_mask_round(A, B, (__v2df)_mm_setzero_pd(), U,\ + _MM_FROUND_CUR_DIRECTION) + +#define _mm512_getexp_ps(A) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getexp_ps(W, U, A) \ + 
((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(W), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getexp_ps(U, A) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getexp_pd(A) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getexp_pd(W, U, A) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getexp_pd(U, A) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_ps (__m512 __A, const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, __imm, + (__v16sf) + _mm512_undefined_ps (), + -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_ps (__m512 __A, __mmask16 __B, __m512 __C, + const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __C, __imm, + (__v16sf) __A, + (__mmask16) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_ps (__mmask16 __A, __m512 __B, const int __imm) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __B, + __imm, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_pd (__m512d __A, const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, __imm, + (__v8df) + _mm512_undefined_pd (), + -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_pd (__m512d __A, __mmask8 __B, __m512d __C, + const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __C, __imm, + (__v8df) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_pd (__mmask8 __A, __m512d __B, const int __imm) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __B, + __imm, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_ss (__m128 __A, __m128 __B, const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ss (__m128 __A, __mmask8 __B, __m128 __C, __m128 __D, + const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __C, + (__v4sf) __D, __imm, + (__v4sf) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_maskz_roundscale_ss (__mmask8 __A, __m128 __B, __m128 __C, + const int __imm) +{ + return (__m128) + __builtin_ia32_rndscaless_mask_round ((__v4sf) __B, + (__v4sf) __C, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_sd (__m128d __A, __m128d __B, const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_sd (__m128d __A, __mmask8 __B, __m128d __C, __m128d __D, + const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __C, + (__v2df) __D, __imm, + (__v2df) __A, + (__mmask8) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_sd (__mmask8 __A, __m128d __B, __m128d __C, + const int __imm) +{ + return (__m128d) + __builtin_ia32_rndscalesd_mask_round ((__v2df) __B, + (__v2df) __C, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_roundscale_ps(A, B) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(A), (int)(B),\ + (__v16sf)_mm512_undefined_ps(), (__mmask16)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_ps(A, B, C, D) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(C), \ + (int)(D), \ + (__v16sf)(__m512)(A), \ + (__mmask16)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_ps(A, B, C) \ + ((__m512) __builtin_ia32_rndscaleps_mask ((__v16sf)(__m512)(B), \ + (int)(C), \ + (__v16sf)_mm512_setzero_ps(),\ + (__mmask16)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_pd(A, B) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(A), (int)(B),\ + (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_mask_roundscale_pd(A, B, C, D) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(C), \ + (int)(D), \ + (__v8df)(__m512d)(A), \ + (__mmask8)(B), _MM_FROUND_CUR_DIRECTION)) +#define _mm512_maskz_roundscale_pd(A, B, C) \ + ((__m512d) __builtin_ia32_rndscalepd_mask ((__v8df)(__m512d)(B), \ + (int)(C), \ + (__v8df)_mm512_setzero_pd(),\ + (__mmask8)(A), _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_ss(A, B, I) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ + (__v4sf) (__m128) (B), \ + (int) (I), \ + (__v4sf) _mm_setzero_ps (), \ + (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_mask_roundscale_ss(A, U, B, C, I) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (B), \ + (__v4sf) (__m128) (C), \ + (int) (I), \ + (__v4sf) (__m128) (A), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_roundscale_ss(U, A, B, I) \ + ((__m128) \ + __builtin_ia32_rndscaless_mask_round ((__v4sf) (__m128) (A), \ + (__v4sf) (__m128) (B), \ + (int) (I), \ + (__v4sf) _mm_setzero_ps (), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_roundscale_sd(A, B, I) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ + (__v2df) (__m128d) (B), \ + (int) (I), \ + (__v2df) _mm_setzero_pd (), \ + (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) 
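/* Editorial illustration -- not part of the upstream GCC header or of this
   patch.  A minimal sketch of how the masked AVX-512F intrinsics declared in
   this hunk might be called from application code built with -mavx512f on a
   GCC toolchain; the function name `clamp_masked' and its arguments are
   hypothetical. */
#include <immintrin.h>

static inline __m512d
clamp_masked (__m512d v, __mmask8 m)
{
  /* Upper-bound each selected lane at 1.0 via the maskz form of
     _mm512_min_pd defined above; lanes whose mask bit is clear are
     zeroed rather than passed through.  */
  return _mm512_maskz_min_pd (m, v, _mm512_set1_pd (1.0));
}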
+#define _mm_mask_roundscale_sd(A, U, B, C, I) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (B), \ + (__v2df) (__m128d) (C), \ + (int) (I), \ + (__v2df) (__m128d) (A), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm_maskz_roundscale_sd(U, A, B, I) \ + ((__m128d) \ + __builtin_ia32_rndscalesd_mask_round ((__v2df) (__m128d) (A), \ + (__v2df) (__m128d) (B), \ + (int) (I), \ + (__v2df) _mm_setzero_pd (), \ + (__mmask8) (U), \ + _MM_FROUND_CUR_DIRECTION)) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_pd_mask (__m512d __X, __m512d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmp_ps_mask (__m512 __X, __m512 __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, __P, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmp_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, __P, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd_mask (__m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_sd_mask (__mmask8 __M, __m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpsd_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss_mask (__m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ss_mask (__mmask8 __M, __m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpss_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __M, + _MM_FROUND_CUR_DIRECTION); +} + +#else +#define _mm512_cmp_pd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm512_cmp_ps_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd512_mask ((__v8df)(__m512d)(X), \ + (__v8df)(__m512d)(Y), (int)(P),\ + (__mmask8)(M), _MM_FROUND_CUR_DIRECTION)) + +#define 
_mm512_mask_cmp_ps_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf)(__m512)(X), \ + (__v16sf)(__m512)(Y), (int)(P),\ + (__mmask16)(M),_MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_sd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpsd_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + M,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_ss_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1,_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpss_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + M,_MM_FROUND_CUR_DIRECTION)) +#endif + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_EQ_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_EQ_OQ, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LT_OS, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_LE_OS, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpunord_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_UNORD_Q, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpunord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_UNORD_Q, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, 
_CMP_NEQ_UQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NEQ_UQ, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnlt_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLT_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnlt_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLT_US, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnle_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLE_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnle_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_NLE_US, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpord_pd_mask (__m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_ORD_Q, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpord_pd_mask (__mmask8 __U, __m512d __X, __m512d __Y) +{ + return (__mmask8) __builtin_ia32_cmppd512_mask ((__v8df) __X, + (__v8df) __Y, _CMP_ORD_Q, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_EQ_OQ, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_EQ_OQ, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmplt_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LT_OS, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmplt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LT_OS, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmple_ps_mask (__m512 __X, __m512 __Y) +{ + return 
(__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LE_OS, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmple_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_LE_OS, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpunord_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_UNORD_Q, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpunord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_UNORD_Q, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpneq_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NEQ_UQ, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpneq_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NEQ_UQ, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnlt_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLT_US, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnlt_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLT_US, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpnle_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLE_US, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpnle_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_NLE_US, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpord_ps_mask (__m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_ORD_Q, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpord_ps_mask (__mmask16 __U, __m512 __X, __m512 __Y) +{ + return (__mmask16) __builtin_ia32_cmpps512_mask ((__v16sf) __X, + (__v16sf) __Y, _CMP_ORD_Q, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask16 +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_kmov (__mmask16 __A) +{ + return __builtin_ia32_kmovw (__A); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd_ps (__m512d __A) +{ + return (__m512) (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd_si512 (__m512d __A) +{ + return (__m512i) (__A); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps_pd (__m512 __A) +{ + return (__m512d) (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps_si512 (__m512 __A) +{ + return (__m512i) (__A); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_ps (__m512i __A) +{ + return (__m512) (__A); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_pd (__m512i __A) +{ + return (__m512d) (__A); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd512_pd128 (__m512d __A) +{ + return (__m128d)_mm512_extractf32x4_ps((__m512)__A, 0); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps512_ps128 (__m512 __A) +{ + return _mm512_extractf32x4_ps(__A, 0); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_si128 (__m512i __A) +{ + return (__m128i)_mm512_extracti32x4_epi32((__m512i)__A, 0); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd512_pd256 (__m512d __A) +{ + return _mm512_extractf64x4_pd(__A, 0); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps512_ps256 (__m512 __A) +{ + return (__m256)_mm512_extractf64x4_pd((__m512d)__A, 0); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_si256 (__m512i __A) +{ + return (__m256i)_mm512_extractf64x4_pd((__m512d)__A, 0); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd128_pd512 (__m128d __A) +{ + return (__m512d) __builtin_ia32_pd512_pd((__m128d)__A); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps128_ps512 (__m128 __A) +{ + return (__m512) __builtin_ia32_ps512_ps((__m128)__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi128_si512 (__m128i __A) +{ + return (__m512i) __builtin_ia32_si512_si((__v4si)__A); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd256_pd512 (__m256d __A) +{ + return __builtin_ia32_pd512_256pd (__A); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps256_ps512 (__m256 __A) +{ + return __builtin_ia32_ps512_256ps (__A); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi256_si512 (__m256i __A) +{ + return (__m512i)__builtin_ia32_si512_256si ((__v8si)__A); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextpd128_pd512 (__m128d __A) +{ + return 
(__m512d) _mm512_insertf32x4 (_mm512_setzero_ps (), (__m128) __A, 0); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextps128_ps512 (__m128 __A) +{ + return _mm512_insertf32x4 (_mm512_setzero_ps (), __A, 0); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextsi128_si512 (__m128i __A) +{ + return _mm512_inserti32x4 (_mm512_setzero_si512 (), __A, 0); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextpd256_pd512 (__m256d __A) +{ + return _mm512_insertf64x4 (_mm512_setzero_pd (), __A, 0); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextps256_ps512 (__m256 __A) +{ + return (__m512) _mm512_insertf64x4 (_mm512_setzero_pd (), (__m256d) __A, 0); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextsi256_si512 (__m256i __A) +{ + return _mm512_inserti64x4 (_mm512_setzero_si512 (), __A, 0); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 0, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 0, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpeq_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 0, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpeq_epu64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 0, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu32_mask (__m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 6, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__mmask16) __builtin_ia32_ucmpd512_mask ((__v16si) __A, + (__v16si) __B, 6, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cmpgt_epu64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 6, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cmpgt_epu64_mask (__m512i __A, __m512i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq512_mask ((__v8di) __A, + (__v8di) __B, 6, + (__mmask8) -1); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __v8si __T1 = (__v8si) _mm512_extracti64x4_epi64 (__A, 1); \ + __v8si __T2 = (__v8si) _mm512_extracti64x4_epi64 (__A, 0); \ + __m256i __T3 = (__m256i) (__T1 op __T2); \ + __v4si __T4 = (__v4si) _mm256_extracti128_si256 (__T3, 1); \ + __v4si __T5 = (__v4si) _mm256_extracti128_si256 (__T3, 0); \ + 
__v4si __T6 = __T4 op __T5; \ + __v4si __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \ + __v4si __T8 = __T6 op __T7; \ + return __T8[0] op __T8[1] + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (+); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (*); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_and_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (&); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_or_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (|); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi32 (__U, __A); + __MM512_REDUCE_OP (+); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (1), __U, __A); + __MM512_REDUCE_OP (*); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_and_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); + __MM512_REDUCE_OP (&); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_or_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi32 (__U, __A); + __MM512_REDUCE_OP (|); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256i __T1 = (__m256i) _mm512_extracti64x4_epi64 (__A, 1); \ + __m256i __T2 = (__m256i) _mm512_extracti64x4_epi64 (__A, 0); \ + __m256i __T3 = _mm256_##op (__T1, __T2); \ + __m128i __T4 = (__m128i) _mm256_extracti128_si256 (__T3, 1); \ + __m128i __T5 = (__m128i) _mm256_extracti128_si256 (__T3, 0); \ + __m128i __T6 = _mm_##op (__T4, __T5); \ + __m128i __T7 = (__m128i) __builtin_shuffle ((__v4si) __T6, \ + (__v4si) { 2, 3, 0, 1 }); \ + __m128i __T8 = _mm_##op (__T6, __T7); \ + __m128i __T9 = (__m128i) __builtin_shuffle ((__v4si) __T8, \ + (__v4si) { 1, 0, 1, 0 }); \ + __v4si __T10 = (__v4si) _mm_##op (__T8, __T9); \ + return __T10[0] + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epi32); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epi32 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epi32); +} + +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epu32 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epu32); +} + +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epu32 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epu32); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epi32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (__INT_MAX__), __U, __A); + __MM512_REDUCE_OP (min_epi32); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epi32 (__mmask16 
__U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (-__INT_MAX__ - 1), __U, __A); + __MM512_REDUCE_OP (max_epi32); +} + +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epu32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi32 (_mm512_set1_epi32 (~0), __U, __A); + __MM512_REDUCE_OP (min_epu32); +} + +extern __inline unsigned int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epu32 (__mmask16 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi32 (__U, __A); + __MM512_REDUCE_OP (max_epu32); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256 __T3 = __T1 op __T2; \ + __m128 __T4 = _mm256_extractf128_ps (__T3, 1); \ + __m128 __T5 = _mm256_extractf128_ps (__T3, 0); \ + __m128 __T6 = __T4 op __T5; \ + __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \ + __m128 __T8 = __T6 op __T7; \ + return __T8[0] op __T8[1] + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_ps (__m512 __A) +{ + __MM512_REDUCE_OP (+); +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_ps (__m512 __A) +{ + __MM512_REDUCE_OP (*); +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_maskz_mov_ps (__U, __A); + __MM512_REDUCE_OP (+); +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_mask_mov_ps (_mm512_set1_ps (1.0f), __U, __A); + __MM512_REDUCE_OP (*); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256 __T1 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256 __T2 = (__m256) _mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256 __T3 = _mm256_##op (__T1, __T2); \ + __m128 __T4 = _mm256_extractf128_ps (__T3, 1); \ + __m128 __T5 = _mm256_extractf128_ps (__T3, 0); \ + __m128 __T6 = _mm_##op (__T4, __T5); \ + __m128 __T7 = __builtin_shuffle (__T6, (__v4si) { 2, 3, 0, 1 }); \ + __m128 __T8 = _mm_##op (__T6, __T7); \ + __m128 __T9 = __builtin_shuffle (__T8, (__v4si) { 1, 0, 1, 0 }); \ + __m128 __T10 = _mm_##op (__T8, __T9); \ + return __T10[0] + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_ps (__m512 __A) +{ + __MM512_REDUCE_OP (min_ps); +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_ps (__m512 __A) +{ + __MM512_REDUCE_OP (max_ps); +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_mask_mov_ps (_mm512_set1_ps (__builtin_inff ()), __U, __A); + __MM512_REDUCE_OP (min_ps); +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_ps (__mmask16 __U, __m512 __A) +{ + __A = _mm512_mask_mov_ps (_mm512_set1_ps (-__builtin_inff ()), __U, __A); + __MM512_REDUCE_OP (max_ps); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __v4di __T1 = (__v4di) _mm512_extracti64x4_epi64 (__A, 1); \ + __v4di __T2 = (__v4di) 
_mm512_extracti64x4_epi64 (__A, 0); \ + __m256i __T3 = (__m256i) (__T1 op __T2); \ + __v2di __T4 = (__v2di) _mm256_extracti128_si256 (__T3, 1); \ + __v2di __T5 = (__v2di) _mm256_extracti128_si256 (__T3, 0); \ + __v2di __T6 = __T4 op __T5; \ + return __T6[0] op __T6[1] + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (+); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (*); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_and_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (&); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_or_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (|); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi64 (__U, __A); + __MM512_REDUCE_OP (+); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (1LL), __U, __A); + __MM512_REDUCE_OP (*); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_and_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); + __MM512_REDUCE_OP (&); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_or_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi64 (__U, __A); + __MM512_REDUCE_OP (|); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m512i __T1 = _mm512_shuffle_i64x2 (__A, __A, 0x4e); \ + __m512i __T2 = _mm512_##op (__A, __T1); \ + __m512i __T3 \ + = (__m512i) __builtin_shuffle ((__v8di) __T2, \ + (__v8di) { 2, 3, 0, 1, 6, 7, 4, 5 });\ + __m512i __T4 = _mm512_##op (__T2, __T3); \ + __m512i __T5 \ + = (__m512i) __builtin_shuffle ((__v8di) __T4, \ + (__v8di) { 1, 0, 3, 2, 5, 4, 7, 6 });\ + __v8di __T6 = (__v8di) _mm512_##op (__T4, __T5); \ + return __T6[0] + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epi64); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epi64 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epi64); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (__LONG_LONG_MAX__), + __U, __A); + __MM512_REDUCE_OP (min_epi64); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epi64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (-__LONG_LONG_MAX__ - 1), + __U, __A); + __MM512_REDUCE_OP (max_epi64); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_epu64 (__m512i __A) +{ + __MM512_REDUCE_OP (min_epu64); +} + +extern __inline unsigned long long +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_epu64 (__m512i __A) +{ + __MM512_REDUCE_OP (max_epu64); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_epu64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_mask_mov_epi64 (_mm512_set1_epi64 (~0LL), __U, __A); + __MM512_REDUCE_OP (min_epu64); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_epu64 (__mmask8 __U, __m512i __A) +{ + __A = _mm512_maskz_mov_epi64 (__U, __A); + __MM512_REDUCE_OP (max_epu64); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); \ + __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); \ + __m256d __T3 = __T1 op __T2; \ + __m128d __T4 = _mm256_extractf128_pd (__T3, 1); \ + __m128d __T5 = _mm256_extractf128_pd (__T3, 0); \ + __m128d __T6 = __T4 op __T5; \ + return __T6[0] op __T6[1] + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_pd (__m512d __A) +{ + __MM512_REDUCE_OP (+); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_pd (__m512d __A) +{ + __MM512_REDUCE_OP (*); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_add_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_maskz_mov_pd (__U, __A); + __MM512_REDUCE_OP (+); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_mul_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_mask_mov_pd (_mm512_set1_pd (1.0), __U, __A); + __MM512_REDUCE_OP (*); +} + +#undef __MM512_REDUCE_OP +#define __MM512_REDUCE_OP(op) \ + __m256d __T1 = (__m256d) _mm512_extractf64x4_pd (__A, 1); \ + __m256d __T2 = (__m256d) _mm512_extractf64x4_pd (__A, 0); \ + __m256d __T3 = _mm256_##op (__T1, __T2); \ + __m128d __T4 = _mm256_extractf128_pd (__T3, 1); \ + __m128d __T5 = _mm256_extractf128_pd (__T3, 0); \ + __m128d __T6 = _mm_##op (__T4, __T5); \ + __m128d __T7 = (__m128d) __builtin_shuffle (__T6, (__v2di) { 1, 0 }); \ + __m128d __T8 = _mm_##op (__T6, __T7); \ + return __T8[0] + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_pd (__m512d __A) +{ + __MM512_REDUCE_OP (min_pd); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_pd (__m512d __A) +{ + __MM512_REDUCE_OP (max_pd); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_min_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_mask_mov_pd (_mm512_set1_pd (__builtin_inf ()), __U, __A); + __MM512_REDUCE_OP (min_pd); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_max_pd (__mmask8 __U, __m512d __A) +{ + __A = _mm512_mask_mov_pd (_mm512_set1_pd (-__builtin_inf ()), __U, __A); + __MM512_REDUCE_OP (max_pd); +} + +#undef __MM512_REDUCE_OP + +#ifdef __DISABLE_AVX512F__ +#undef __DISABLE_AVX512F__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512F__ */ + +#endif /* _AVX512FINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512fp16intrin.h b/include-gcc/avx512fp16intrin.h new file mode 100644 index 0000000..dd083e5 --- /dev/null +++ b/include-gcc/avx512fp16intrin.h @@ -0,0 
+1,7219 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512FP16INTRIN_H_INCLUDED +#define __AVX512FP16INTRIN_H_INCLUDED + +#ifndef __AVX512FP16__ +#pragma GCC push_options +#pragma GCC target("avx512fp16") +#define __DISABLE_AVX512FP16__ +#endif /* __AVX512FP16__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef _Float16 __v8hf __attribute__ ((__vector_size__ (16))); +typedef _Float16 __v16hf __attribute__ ((__vector_size__ (32))); +typedef _Float16 __v32hf __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef _Float16 __m128h __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef _Float16 __m256h __attribute__ ((__vector_size__ (32), __may_alias__)); +typedef _Float16 __m512h __attribute__ ((__vector_size__ (64), __may_alias__)); + +/* Unaligned version of the same type. 
*/ +typedef _Float16 __m128h_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef _Float16 __m256h_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef _Float16 __m512h_u __attribute__ ((__vector_size__ (64), \ + __may_alias__, __aligned__ (1))); + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ph (_Float16 __A7, _Float16 __A6, _Float16 __A5, + _Float16 __A4, _Float16 __A3, _Float16 __A2, + _Float16 __A1, _Float16 __A0) +{ + return __extension__ (__m128h)(__v8hf){ __A0, __A1, __A2, __A3, + __A4, __A5, __A6, __A7 }; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_ph (_Float16 __A15, _Float16 __A14, _Float16 __A13, + _Float16 __A12, _Float16 __A11, _Float16 __A10, + _Float16 __A9, _Float16 __A8, _Float16 __A7, + _Float16 __A6, _Float16 __A5, _Float16 __A4, + _Float16 __A3, _Float16 __A2, _Float16 __A1, + _Float16 __A0) +{ + return __extension__ (__m256h)(__v16hf){ __A0, __A1, __A2, __A3, + __A4, __A5, __A6, __A7, + __A8, __A9, __A10, __A11, + __A12, __A13, __A14, __A15 }; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set_ph (_Float16 __A31, _Float16 __A30, _Float16 __A29, + _Float16 __A28, _Float16 __A27, _Float16 __A26, + _Float16 __A25, _Float16 __A24, _Float16 __A23, + _Float16 __A22, _Float16 __A21, _Float16 __A20, + _Float16 __A19, _Float16 __A18, _Float16 __A17, + _Float16 __A16, _Float16 __A15, _Float16 __A14, + _Float16 __A13, _Float16 __A12, _Float16 __A11, + _Float16 __A10, _Float16 __A9, _Float16 __A8, + _Float16 __A7, _Float16 __A6, _Float16 __A5, + _Float16 __A4, _Float16 __A3, _Float16 __A2, + _Float16 __A1, _Float16 __A0) +{ + return __extension__ (__m512h)(__v32hf){ __A0, __A1, __A2, __A3, + __A4, __A5, __A6, __A7, + __A8, __A9, __A10, __A11, + __A12, __A13, __A14, __A15, + __A16, __A17, __A18, __A19, + __A20, __A21, __A22, __A23, + __A24, __A25, __A26, __A27, + __A28, __A29, __A30, __A31 }; +} + +/* Create vectors of elements in the reversed order from _mm_set_ph, + _mm256_set_ph and _mm512_set_ph functions. 
*/ + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, + _Float16 __A3, _Float16 __A4, _Float16 __A5, + _Float16 __A6, _Float16 __A7) +{ + return _mm_set_ph (__A7, __A6, __A5, __A4, __A3, __A2, __A1, __A0); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, + _Float16 __A3, _Float16 __A4, _Float16 __A5, + _Float16 __A6, _Float16 __A7, _Float16 __A8, + _Float16 __A9, _Float16 __A10, _Float16 __A11, + _Float16 __A12, _Float16 __A13, _Float16 __A14, + _Float16 __A15) +{ + return _mm256_set_ph (__A15, __A14, __A13, __A12, __A11, __A10, __A9, + __A8, __A7, __A6, __A5, __A4, __A3, __A2, __A1, + __A0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setr_ph (_Float16 __A0, _Float16 __A1, _Float16 __A2, + _Float16 __A3, _Float16 __A4, _Float16 __A5, + _Float16 __A6, _Float16 __A7, _Float16 __A8, + _Float16 __A9, _Float16 __A10, _Float16 __A11, + _Float16 __A12, _Float16 __A13, _Float16 __A14, + _Float16 __A15, _Float16 __A16, _Float16 __A17, + _Float16 __A18, _Float16 __A19, _Float16 __A20, + _Float16 __A21, _Float16 __A22, _Float16 __A23, + _Float16 __A24, _Float16 __A25, _Float16 __A26, + _Float16 __A27, _Float16 __A28, _Float16 __A29, + _Float16 __A30, _Float16 __A31) + +{ + return _mm512_set_ph (__A31, __A30, __A29, __A28, __A27, __A26, __A25, + __A24, __A23, __A22, __A21, __A20, __A19, __A18, + __A17, __A16, __A15, __A14, __A13, __A12, __A11, + __A10, __A9, __A8, __A7, __A6, __A5, __A4, __A3, + __A2, __A1, __A0); +} + +/* Broadcast _Float16 to vector. */ + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_ph (_Float16 __A) +{ + return _mm_set_ph (__A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_ph (_Float16 __A) +{ + return _mm256_set_ph (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_ph (_Float16 __A) +{ + return _mm512_set_ph (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +/* Create a vector with all zeros. 
*/ + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_ph (void) +{ + return _mm_set1_ph (0.0f16); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_ph (void) +{ + return _mm256_set1_ph (0.0f16); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_setzero_ph (void) +{ + return _mm512_set1_ph (0.0f16); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_ph (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m128h __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_ph (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m256h __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_undefined_ph (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m512h __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_h (__m128h __A) +{ + return __A[0]; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsh_h (__m256h __A) +{ + return __A[0]; +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtsh_h (__m512h __A) +{ + return __A[0]; +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_ps (__m512h __a) +{ + return (__m512) __a; +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_pd (__m512h __a) +{ + return (__m512d) __a; +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph_si512 (__m512h __a) +{ + return (__m512i) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph512_ph128 (__m512h __A) +{ + union + { + __m128h __a[4]; + __m512h __v; + } __u = { .__v = __A }; + return __u.__a[0]; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph512_ph256 (__m512h __A) +{ + union + { + __m256h __a[2]; + __m512h __v; + } __u = { .__v = __A }; + return __u.__a[0]; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph128_ph512 (__m128h __A) +{ + union + { + __m128h __a[4]; + __m512h __v; + } __u; + __u.__a[0] = __A; + return __u.__v; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castph256_ph512 (__m256h __A) +{ + union + { + __m256h __a[2]; + __m512h __v; + } __u; + __u.__a[0] = __A; + return __u.__v; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextph128_ph512 (__m128h __A) +{ + return (__m512h) _mm512_insertf32x4 (_mm512_setzero_ps (), + (__m128) __A, 0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_zextph256_ph512 (__m256h __A) +{ + return (__m512h) _mm512_insertf64x4 (_mm512_setzero_pd (), + 
(__m256d) __A, 0); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castps_ph (__m512 __a) +{ + return (__m512h) __a; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castpd_ph (__m512d __a) +{ + return (__m512h) __a; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_castsi512_ph (__m512i __a) +{ + return (__m512h) __a; +} + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_sh (_Float16 __F) +{ + return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, + __F); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_sh (void const *__P) +{ + return _mm_set_ph (0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, 0.0f16, + *(_Float16 const *) __P); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_load_ph (void const *__P) +{ + return *(const __m512h *) __P; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_ph (void const *__P) +{ + return *(const __m256h *) __P; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ph (void const *__P) +{ + return *(const __m128h *) __P; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_loadu_ph (void const *__P) +{ + return *(const __m512h_u *) __P; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_ph (void const *__P) +{ + return *(const __m256h_u *) __P; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_ph (void const *__P) +{ + return *(const __m128h_u *) __P; +} + +/* Stores the lower _Float16 value. 
*/ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_sh (void *__P, __m128h __A) +{ + *(_Float16 *) __P = ((__v8hf)__A)[0]; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_store_ph (void *__P, __m512h __A) +{ + *(__m512h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_ph (void *__P, __m256h __A) +{ + *(__m256h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ph (void *__P, __m128h __A) +{ + *(__m128h *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_storeu_ph (void *__P, __m512h __A) +{ + *(__m512h_u *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_ph (void *__P, __m256h __A) +{ + *(__m256h_u *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_ph (void *__P, __m128h __A) +{ + *(__m128h_u *) __P = __A; +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_abs_ph (__m512h __A) +{ + return (__m512h) _mm512_and_epi32 ( _mm512_set1_epi32 (0x7FFF7FFF), + (__m512i) __A); +} + +/* Intrinsics v[add,sub,mul,div]ph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A + (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_addph512_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_addph512_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A - (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_subph512_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_subph512_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A * (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_mulph512_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_mulph512_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_div_ph (__m512h __A, __m512h __B) +{ + return (__m512h) ((__v32hf) __A / (__v32hf) __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_divph512_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_divph512_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_add_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_addph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_add_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_addph512_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_add_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_addph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sub_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_subph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sub_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_subph512_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sub_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_subph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mul_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_mulph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_mul_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_mulph512_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_mul_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_mulph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_div_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_divph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_div_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return 
__builtin_ia32_divph512_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_div_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_divph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} +#else +#define _mm512_add_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_addph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_add_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_addph512_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_add_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_addph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#define _mm512_sub_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_subph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_sub_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_subph512_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_sub_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_subph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#define _mm512_mul_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_mulph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_mul_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_mulph512_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_mul_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_mulph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#define _mm512_div_round_ph(A, B, C) \ + ((__m512h)__builtin_ia32_divph512_mask_round((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_div_round_ph(A, B, C, D, E) \ + ((__m512h)__builtin_ia32_divph512_mask_round((C), (D), (A), (B), (E))) + +#define _mm512_maskz_div_round_ph(A, B, C, D) \ + ((__m512h)__builtin_ia32_divph512_mask_round((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) +#endif /* __OPTIMIZE__ */ + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_conj_pch (__m512h __A) +{ + return (__m512h) _mm512_xor_epi32 ((__m512i) __A, _mm512_set1_epi32 (1<<31)); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_conj_pch (__m512h __W, __mmask16 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A), + (__v16sf) __W, + (__mmask16) __U); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_conj_pch (__mmask16 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_movaps512_mask ((__v16sf) _mm512_conj_pch (__A), + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U); +} + +/* Intrinsics of v[add,sub,mul,div]sh. 
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_sh (__m128h __A, __m128h __B) +{ + __A[0] += __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_addsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_addsh_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_sh (__m128h __A, __m128h __B) +{ + __A[0] -= __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_subsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_subsh_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_sh (__m128h __A, __m128h __B) +{ + __A[0] *= __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_mulsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_mulsh_mask (__B, __C, _mm_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_sh (__m128h __A, __m128h __B) +{ + __A[0] /= __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_divsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_divsh_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_addsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_addsh_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_addsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return 
__builtin_ia32_subsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_subsh_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_subsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_mulsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_mulsh_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_mulsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_divsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_divsh_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_divsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} +#else +#define _mm_add_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_addsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_add_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_addsh_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_add_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_addsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#define _mm_sub_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_subsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_sub_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_subsh_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_sub_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_subsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#define _mm_mul_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_mulsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_mul_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_mulsh_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_mul_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_mulsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) + +#define _mm_div_round_sh(A, B, C) \ + ((__m128h)__builtin_ia32_divsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + 
(__mmask8)-1, (C))) + +#define _mm_mask_div_round_sh(A, B, C, D, E) \ + ((__m128h)__builtin_ia32_divsh_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_div_round_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_divsh_mask_round ((B), (C), \ + _mm_setzero_ph (), \ + (A), (D))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsic vmaxph vminph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_ph (__m512h __A, __m512h __B) +{ + return __builtin_ia32_maxph512_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_maxph512_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_maxph512_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_ph (__m512h __A, __m512h __B) +{ + return __builtin_ia32_minph512_mask (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_minph512_mask (__C, __D, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_minph512_mask (__B, __C, + _mm512_setzero_ph (), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_max_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_maxph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_max_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_maxph512_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_max_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_maxph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_min_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_minph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_min_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_minph512_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_min_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_minph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_max_round_ph(A, B, C) \ + (__builtin_ia32_maxph512_mask_round ((A), (B), \ + _mm512_setzero_ph 
(), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_max_round_ph(A, B, C, D, E) \ + (__builtin_ia32_maxph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_max_round_ph(A, B, C, D) \ + (__builtin_ia32_maxph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#define _mm512_min_round_ph(A, B, C) \ + (__builtin_ia32_minph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_min_round_ph(A, B, C, D, E) \ + (__builtin_ia32_minph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_min_round_ph(A, B, C, D) \ + (__builtin_ia32_minph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsic vmaxsh vminsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_sh (__m128h __A, __m128h __B) +{ + __A[0] = __A[0] > __B[0] ? __A[0] : __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_maxsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_maxsh_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_sh (__m128h __A, __m128h __B) +{ + __A[0] = __A[0] < __B[0] ? __A[0] : __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_minsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_minsh_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_maxsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_maxsh_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_maxsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_minsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_minsh_mask_round (__C, __D, __A, __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_round_sh (__mmask8 __A, 
__m128h __B, __m128h __C,
+                        const int __D)
+{
+  return __builtin_ia32_minsh_mask_round (__B, __C,
+                                          _mm_setzero_ph (),
+                                          __A, __D);
+}
+
+#else
+#define _mm_max_round_sh(A, B, C)                                     \
+  (__builtin_ia32_maxsh_mask_round ((A), (B),                         \
+                                    _mm_setzero_ph (),                \
+                                    (__mmask8)-1, (C)))
+
+#define _mm_mask_max_round_sh(A, B, C, D, E)                          \
+  (__builtin_ia32_maxsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_max_round_sh(A, B, C, D)                            \
+  (__builtin_ia32_maxsh_mask_round ((B), (C),                         \
+                                    _mm_setzero_ph (),                \
+                                    (A), (D)))
+
+#define _mm_min_round_sh(A, B, C)                                     \
+  (__builtin_ia32_minsh_mask_round ((A), (B),                         \
+                                    _mm_setzero_ph (),                \
+                                    (__mmask8)-1, (C)))
+
+#define _mm_mask_min_round_sh(A, B, C, D, E)                          \
+  (__builtin_ia32_minsh_mask_round ((C), (D), (A), (B), (E)))
+
+#define _mm_maskz_min_round_sh(A, B, C, D)                            \
+  (__builtin_ia32_minsh_mask_round ((B), (C),                         \
+                                    _mm_setzero_ph (),                \
+                                    (A), (D)))
+
+#endif /* __OPTIMIZE__ */
+
+/* vcmpph */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_ph_mask (__m512h __A, __m512h __B, const int __C)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask (__A, __B, __C,
+                                                   (__mmask32) -1);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
+                         const int __D)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask (__B, __C, __D,
+                                                   __A);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_cmp_round_ph_mask (__m512h __A, __m512h __B, const int __C,
+                          const int __D)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask_round (__A, __B,
+                                                         __C, (__mmask32) -1,
+                                                         __D);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_cmp_round_ph_mask (__mmask32 __A, __m512h __B, __m512h __C,
+                               const int __D, const int __E)
+{
+  return (__mmask32) __builtin_ia32_cmpph512_mask_round (__B, __C,
+                                                         __D, __A,
+                                                         __E);
+}
+
+#else
+#define _mm512_cmp_ph_mask(A, B, C)                     \
+  (__builtin_ia32_cmpph512_mask ((A), (B), (C), (-1)))
+
+#define _mm512_mask_cmp_ph_mask(A, B, C, D)             \
+  (__builtin_ia32_cmpph512_mask ((B), (C), (D), (A)))
+
+#define _mm512_cmp_round_ph_mask(A, B, C, D)                          \
+  (__builtin_ia32_cmpph512_mask_round ((A), (B), (C), (-1), (D)))
+
+#define _mm512_mask_cmp_round_ph_mask(A, B, C, D, E)                  \
+  (__builtin_ia32_cmpph512_mask_round ((B), (C), (D), (A), (E)))
+
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vcmpsh.
*/ +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sh_mask (__m128h __A, __m128h __B, const int __C) +{ + return (__mmask8) + __builtin_ia32_cmpsh_mask_round (__A, __B, + __C, (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return (__mmask8) + __builtin_ia32_cmpsh_mask_round (__B, __C, + __D, __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_round_sh_mask (__m128h __A, __m128h __B, const int __C, + const int __D) +{ + return (__mmask8) __builtin_ia32_cmpsh_mask_round (__A, __B, + __C, (__mmask8) -1, + __D); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_round_sh_mask (__mmask8 __A, __m128h __B, __m128h __C, + const int __D, const int __E) +{ + return (__mmask8) __builtin_ia32_cmpsh_mask_round (__B, __C, + __D, __A, + __E); +} + +#else +#define _mm_cmp_sh_mask(A, B, C) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), \ + (_MM_FROUND_CUR_DIRECTION))) + +#define _mm_mask_cmp_sh_mask(A, B, C, D) \ + (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), \ + (_MM_FROUND_CUR_DIRECTION))) + +#define _mm_cmp_round_sh_mask(A, B, C, D) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (C), (-1), (D))) + +#define _mm_mask_cmp_round_sh_mask(A, B, C, D, E) \ + (__builtin_ia32_cmpsh_mask_round ((B), (C), (D), (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcomish. */ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OS, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_US, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_EQ_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_ucomilt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_LE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GT_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_GE_OQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, _CMP_NEQ_UQ, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_sh (__m128h __A, __m128h __B, const int __P) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, __P, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comi_round_sh (__m128h __A, __m128h __B, const int __P, const int __R) +{ + return __builtin_ia32_cmpsh_mask_round (__A, __B, __P, + (__mmask8) -1,__R); +} + +#else +#define _mm_comi_round_sh(A, B, P, R) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), (R))) +#define _mm_comi_sh(A, B, P) \ + (__builtin_ia32_cmpsh_mask_round ((A), (B), (P), (__mmask8) (-1), \ + _MM_FROUND_CUR_DIRECTION)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vsqrtph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_ph (__m512h __A) +{ + return __builtin_ia32_sqrtph512_mask_round (__A, + _mm512_setzero_ph(), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_sqrtph512_mask_round (__B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_sqrt_round_ph (__m512h __A, const int __B) +{ + return __builtin_ia32_sqrtph512_mask_round (__A, + _mm512_setzero_ph(), + (__mmask32) -1, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_sqrt_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_sqrtph512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_sqrt_round_ph (__mmask32 __A, __m512h __B, const int __C) +{ + return __builtin_ia32_sqrtph512_mask_round (__B, + _mm512_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_sqrt_round_ph(A, B) \ + (__builtin_ia32_sqrtph512_mask_round ((A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (B))) + +#define _mm512_mask_sqrt_round_ph(A, B, C, D) \ + (__builtin_ia32_sqrtph512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_sqrt_round_ph(A, B, C) \ + (__builtin_ia32_sqrtph512_mask_round ((B), \ + _mm512_setzero_ph (), \ + (A), (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrsqrtph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rsqrt_ph (__m512h __A) +{ + return __builtin_ia32_rsqrtph512_mask (__A, _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rsqrt_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_rsqrtph512_mask (__C, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rsqrt_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_rsqrtph512_mask (__B, _mm512_setzero_ph (), + __A); +} + +/* Intrinsics vrsqrtsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_rsqrtsh_mask (__B, __A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_rsqrtsh_mask (__D, __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_rsqrtsh_mask (__C, __B, _mm_setzero_ph (), + __A); +} + +/* Intrinsics vsqrtsh. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_sqrtsh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_sqrtsh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_sqrtsh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_sqrtsh_mask_round (__D, __C, __A, __B, + __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_sqrtsh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __D); +} + +#else +#define _mm_sqrt_round_sh(A, B, C) \ + (__builtin_ia32_sqrtsh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_sqrt_round_sh(A, B, C, D, E) \ + (__builtin_ia32_sqrtsh_mask_round ((D), (C), (A), (B), (E))) + +#define _mm_maskz_sqrt_round_sh(A, B, C, D) \ + (__builtin_ia32_sqrtsh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrcpph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_rcp_ph (__m512h __A) +{ + return __builtin_ia32_rcpph512_mask (__A, _mm512_setzero_ph (), + (__mmask32) -1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_rcp_ph (__m512h __A, __mmask32 __B, __m512h __C) +{ + return __builtin_ia32_rcpph512_mask (__C, __A, __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_rcp_ph (__mmask32 __A, __m512h __B) +{ + return __builtin_ia32_rcpph512_mask (__B, _mm512_setzero_ph (), + __A); +} + +/* Intrinsics vrcpsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_rcpsh_mask (__B, __A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp_sh (__m128h __A, __mmask32 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_rcpsh_mask (__D, __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp_sh (__mmask32 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_rcpsh_mask (__C, __B, _mm_setzero_ph (), + __A); +} + +/* Intrinsics vscalefph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_ph (__m512h __A, __m512h __B) +{ + return __builtin_ia32_scalefph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_ph (__m512h __A, __mmask32 __B, __m512h __C, __m512h __D) +{ + return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_ph (__mmask32 __A, __m512h __B, __m512h __C) +{ + return __builtin_ia32_scalefph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_scalef_round_ph (__m512h __A, __m512h __B, const int __C) +{ + return __builtin_ia32_scalefph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_scalef_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + __m512h __D, const int __E) +{ + return __builtin_ia32_scalefph512_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_scalef_round_ph (__mmask32 __A, __m512h __B, __m512h __C, + const int __D) +{ + return __builtin_ia32_scalefph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_scalef_round_ph(A, B, C) \ + (__builtin_ia32_scalefph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_scalef_round_ph(A, B, C, D, E) \ + (__builtin_ia32_scalefph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_scalef_round_ph(A, B, C, D) \ + (__builtin_ia32_scalefph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vscalefsh. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_sh (__m128h __A, __m128h __B) +{ + return __builtin_ia32_scalefsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_scalefsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_round_sh (__m128h __A, __m128h __B, const int __C) +{ + return __builtin_ia32_scalefsh_mask_round (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1, __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return __builtin_ia32_scalefsh_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return __builtin_ia32_scalefsh_mask_round (__B, __C, + _mm_setzero_ph (), + __A, __D); +} + +#else +#define _mm_scalef_round_sh(A, B, C) \ + (__builtin_ia32_scalefsh_mask_round ((A), (B), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (C))) + +#define _mm_mask_scalef_round_sh(A, B, C, D, E) \ + (__builtin_ia32_scalefsh_mask_round ((C), (D), (A), (B), (E))) + +#define _mm_maskz_scalef_round_sh(A, B, C, D) \ + (__builtin_ia32_scalefsh_mask_round ((B), (C), _mm_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreduceph. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_ph (__m512h __A, int __B) +{ + return __builtin_ia32_reduceph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_ph (__m512h __A, __mmask32 __B, __m512h __C, int __D) +{ + return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_ph (__mmask32 __A, __m512h __B, int __C) +{ + return __builtin_ia32_reduceph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_round_ph (__m512h __A, int __B, const int __C) +{ + return __builtin_ia32_reduceph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_reduce_round_ph (__m512h __A, __mmask32 __B, __m512h __C, + int __D, const int __E) +{ + return __builtin_ia32_reduceph512_mask_round (__C, __D, __A, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_reduce_round_ph (__mmask32 __A, __m512h __B, int __C, + const int __D) +{ + return __builtin_ia32_reduceph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_reduce_ph(A, B) \ + (__builtin_ia32_reduceph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_reduce_round_ph(A, B, C) \ + (__builtin_ia32_reduceph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_reduce_round_ph(A, B, C, D, E) \ + (__builtin_ia32_reduceph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_reduce_round_ph(A, B, C, D) \ + (__builtin_ia32_reduceph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vreducesh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_sh (__m128h __A, __m128h __B, int __C) +{ + return __builtin_ia32_reducesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E) +{ + return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D) +{ + return __builtin_ia32_reducesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_round_sh (__m128h __A, __m128h __B, int __C, const int __D) +{ + return __builtin_ia32_reducesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E, const int __F) +{ + return __builtin_ia32_reducesh_mask_round (__C, __D, __E, __A, + __B, __F); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + int __D, const int __E) +{ + return __builtin_ia32_reducesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_reduce_sh(A, B, C) \ + (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_sh(A, B, C, D, E) \ + (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_sh(A, B, C, D) \ + (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_sh(A, B, C, D) \ + (__builtin_ia32_reducesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (D))) + +#define _mm_mask_reduce_round_sh(A, B, C, D, E, F) \ + (__builtin_ia32_reducesh_mask_round ((C), (D), (E), (A), (B), (F))) + +#define _mm_maskz_reduce_round_sh(A, B, C, D, E) \ + (__builtin_ia32_reducesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscaleph. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m512h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_ph (__m512h __A, int __B) +{ + return __builtin_ia32_rndscaleph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_ph (__m512h __A, __mmask32 __B, + __m512h __C, int __D) +{ + return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_ph (__mmask32 __A, __m512h __B, int __C) +{ + return __builtin_ia32_rndscaleph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_roundscale_round_ph (__m512h __A, int __B, const int __C) +{ + return __builtin_ia32_rndscaleph512_mask_round (__A, __B, + _mm512_setzero_ph (), + (__mmask32) -1, + __C); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_roundscale_round_ph (__m512h __A, __mmask32 __B, + __m512h __C, int __D, const int __E) +{ + return __builtin_ia32_rndscaleph512_mask_round (__C, __D, __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_roundscale_round_ph (__mmask32 __A, __m512h __B, int __C, + const int __D) +{ + return __builtin_ia32_rndscaleph512_mask_round (__B, __C, + _mm512_setzero_ph (), + __A, __D); +} + +#else +#define _mm512_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), \ + _MM_FROUND_CUR_DIRECTION)) +#define _mm512_roundscale_round_ph(A, B, C) \ + (__builtin_ia32_rndscaleph512_mask_round ((A), (B), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, (C))) + +#define _mm512_mask_roundscale_round_ph(A, B, C, D, E) \ + (__builtin_ia32_rndscaleph512_mask_round ((C), (D), (A), (B), (E))) + +#define _mm512_maskz_roundscale_round_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph512_mask_round ((B), (C), \ + _mm512_setzero_ph (), \ + (A), (D))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscalesh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_sh (__m128h __A, __m128h __B, int __C) +{ + return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E) +{ + return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_sh (__mmask8 __A, __m128h __B, __m128h __C, int __D) +{ + return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), __A, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_round_sh (__m128h __A, __m128h __B, int __C, const int __D) +{ + return __builtin_ia32_rndscalesh_mask_round (__A, __B, __C, + _mm_setzero_ph (), + (__mmask8) -1, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_round_sh (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, int __E, const int __F) +{ + return __builtin_ia32_rndscalesh_mask_round (__C, __D, __E, + __A, __B, __F); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_round_sh (__mmask8 __A, __m128h __B, __m128h __C, + int __D, const int __E) +{ + return __builtin_ia32_rndscalesh_mask_round (__B, __C, __D, + _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_roundscale_sh(A, B, C) \ + (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_sh(A, B, C, D, E) \ + (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_sh(A, B, C, D) \ + (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_roundscale_round_sh(A, B, C, D) \ + (__builtin_ia32_rndscalesh_mask_round ((A), (B), (C), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (D))) + +#define _mm_mask_roundscale_round_sh(A, B, C, D, E, F) \ + (__builtin_ia32_rndscalesh_mask_round ((C), (D), (E), (A), (B), (F))) + +#define _mm_maskz_roundscale_round_sh(A, B, C, D, E) \ + (__builtin_ia32_rndscalesh_mask_round ((B), (C), (D), \ + _mm_setzero_ph (), \ + (A), (E))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfpclasssh. 
*/
+#ifdef __OPTIMIZE__
+extern __inline __mmask8
+ __attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_fpclass_sh_mask (__m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm,
+                                                   (__mmask8) -1);
+}
+
+extern __inline __mmask8
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mask_fpclass_sh_mask (__mmask8 __U, __m128h __A, const int __imm)
+{
+  return (__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) __A, __imm, __U);
+}
+
+#else
+#define _mm_fpclass_sh_mask(X, C)                                        \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X),     \
+                                             (int) (C), (__mmask8) (-1))) \
+
+#define _mm_mask_fpclass_sh_mask(U, X, C)                                \
+  ((__mmask8) __builtin_ia32_fpclasssh_mask ((__v8hf) (__m128h) (X),     \
+                                             (int) (C), (__mmask8) (U)))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vfpclassph. */
+#ifdef __OPTIMIZE__
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_mask_fpclass_ph_mask (__mmask32 __U, __m512h __A,
+                             const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                       __imm, __U);
+}
+
+extern __inline __mmask32
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm512_fpclass_ph_mask (__m512h __A, const int __imm)
+{
+  return (__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) __A,
+                                                       __imm,
+                                                       (__mmask32) -1);
+}
+
+#else
+#define _mm512_mask_fpclass_ph_mask(u, x, c)                              \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x),\
+                                                 (int) (c),(__mmask8)(u)))
+
+#define _mm512_fpclass_ph_mask(x, c)                                      \
+  ((__mmask32) __builtin_ia32_fpclassph512_mask ((__v32hf) (__m512h) (x),\
+                                                 (int) (c),(__mmask8)-1))
+#endif /* __OPTIMIZE__ */
+
+/* Intrinsics vgetexpph, vgetexpsh.
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_sh (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) __W, (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_sh (__mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__v8hf) _mm_setzero_ph (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_ph (__m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) _mm512_setzero_ph (), + (__mmask32) -1, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_ph (__m512h __W, __mmask32 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, (__v32hf) __W, + (__mmask32) __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_ph (__mmask32 __U, __m512h __A) +{ + return (__m512h) + __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) _mm512_setzero_ph (), + (__mmask32) __U, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_round_sh (__m128h __A, __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + _mm_setzero_ph (), + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_round_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __W, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_round_sh (__mmask8 __U, __m128h __A, __m128h __B, + const int __R) +{ + return (__m128h) __builtin_ia32_getexpsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getexp_round_ph (__m512h __A, const int __R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) + _mm512_setzero_ph (), + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getexp_round_ph (__m512h __W, __mmask32 __U, __m512h __A, + const int __R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) __W, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getexp_round_ph (__mmask32 __U, __m512h __A, const int 
__R) +{ + return (__m512h) __builtin_ia32_getexpph512_mask ((__v32hf) __A, + (__v32hf) + _mm512_setzero_ph (), + (__mmask32) __U, __R); +} + +#else +#define _mm_getexp_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh_mask_round((__v8hf)(__m128h)(A), \ + (__v8hf)(__m128h)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, R)) + +#define _mm_mask_getexp_round_sh(W, U, A, B, C) \ + (__m128h)__builtin_ia32_getexpsh_mask_round(A, B, W, U, C) + +#define _mm_maskz_getexp_round_sh(U, A, B, C) \ + (__m128h)__builtin_ia32_getexpsh_mask_round(A, B, \ + (__v8hf)_mm_setzero_ph(), \ + U, C) + +#define _mm512_getexp_round_ph(A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, R)) + +#define _mm512_mask_getexp_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(W), (__mmask32)(U), R)) + +#define _mm512_maskz_getexp_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), R)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vgetmantph, vgetmantsh. */ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_sh (__m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, (__v8hf) __W, + __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_sh (__mmask8 __U, __m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D) +{ + return (__m128h) + __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) _mm_setzero_ph(), + __U, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_ph (__m512h __W, __mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_ph (__mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) + _mm512_setzero_ph (), + __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_getmant_round_sh (__m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + _mm_setzero_ph (), + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_round_sh (__m128h __W, __mmask8 __U, __m128h __A, + __m128h __B, _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) __W, + __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_round_sh (__mmask8 __U, __m128h __A, __m128h __B, + _MM_MANTISSA_NORM_ENUM __C, + _MM_MANTISSA_SIGN_ENUM __D, const int __R) +{ + return (__m128h) __builtin_ia32_getmantsh_mask_round ((__v8hf) __A, + (__v8hf) __B, + (__D << 2) | __C, + (__v8hf) + _mm_setzero_ph(), + __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_getmant_round_ph (__m512h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + _mm512_setzero_ph (), + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_getmant_round_ph (__m512h __W, __mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) __W, __U, + __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_getmant_round_ph (__mmask32 __U, __m512h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C, const int __R) +{ + return (__m512h) __builtin_ia32_getmantph512_mask ((__v32hf) __A, + (__C << 2) | __B, + (__v32hf) + _mm512_setzero_ph (), + __U, __R); +} + +#else +#define _mm512_getmant_ph(X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ph(W, U, X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + + +#define _mm512_maskz_getmant_ph(U, X, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getmant_sh(X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sh(W, U, X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_sh(U, X, Y, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | 
(C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getmant_round_ph(X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)-1, \ + (R))) + +#define _mm512_mask_getmant_round_ph(W, U, X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), \ + (R))) + + +#define _mm512_maskz_getmant_round_ph(U, X, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask ((__v32hf)(__m512h)(X), \ + (int)(((C)<<2) | (B)), \ + (__v32hf)(__m512h) \ + _mm512_setzero_ph(), \ + (__mmask32)(U), \ + (R))) + +#define _mm_getmant_round_sh(X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (R))) + +#define _mm_mask_getmant_round_sh(W, U, X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U), \ + (R))) + +#define _mm_maskz_getmant_round_sh(U, X, Y, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_mask_round ((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), \ + (int)(((D)<<2) | (C)), \ + (__v8hf)(__m128h) \ + _mm_setzero_ph(), \ + (__mmask8)(U), \ + (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vmovw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi16_si128 (short __A) +{ + return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, __A); +} + +extern __inline short +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si16 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, 0); +} + +/* Intrinsics vmovsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_sh (__m128h __A, __mmask8 __B, _Float16 const* __C) +{ + return __builtin_ia32_loadsh_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_sh (__mmask8 __A, _Float16 const* __B) +{ + return __builtin_ia32_loadsh_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_sh (_Float16 const* __A, __mmask8 __B, __m128h __C) +{ + __builtin_ia32_storesh_mask (__A, __C, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sh (__m128h __A, __m128h __B) +{ + __A[0] = __B[0]; + return __A; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_move_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_vmovsh_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_move_sh (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_vmovsh_mask (__B, __C, _mm_setzero_ph (), __A); +} + +/* Intrinsics vcvtph2dq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi32 (__m512i __A, __mmask16 __B, __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((C), (__v16si)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epi32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2dq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2udq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu32 (__m512i __A, __mmask16 __B, __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epu32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((C), (__v16si)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2udq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2dq. 
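+   Editorial note (not in the upstream header): the _mm512_cvttph_* forms
+   below truncate toward zero, whereas the _mm512_cvtph_* forms above honor
+   the current or explicitly supplied rounding mode, e.g.
+     __m512i trunc_f16_to_i32 (__m256h h) { return _mm512_cvttph_epi32 (h); }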
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi32 (__m512i __A, __mmask16 __B, + __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2dq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((A), \ + (__v16si) \ + (_mm512_setzero_si512 ()), \ + (__mmask16)(-1), (B))) + +#define _mm512_mask_cvtt_roundph_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((C), \ + (__v16si)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epi32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2dq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2udq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu32 (__m256h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu32 (__m512i __A, __mmask16 __B, __m256h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__C, + (__v16si) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu32 (__mmask16 __A, __m256h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu32 (__m256h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu32 (__m512i __A, __mmask16 __B, + __m256h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__C, + (__v16si) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu32 (__mmask16 __A, __m256h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2udq512_mask_round (__B, + (__v16si) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu32(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((A), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((C), \ + (__v16si)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epu32(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2udq512_mask_round ((B), \ + (__v16si) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtdq2ph. 
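+   Usage sketch (editorial illustration, not part of the upstream header):
+   16 signed 32-bit integers narrow to half precision; the round variant
+   takes an explicit rounding mode such as
+   _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC, e.g.
+     __m256h i32_to_f16 (__m512i v)
+     { return _mm512_cvt_roundepi32_ph (v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); }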
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi32_ph (__m512i __A) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi32_ph (__m256h __A, __mmask16 __B, __m512i __C) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi32_ph (__mmask16 __A, __m512i __B) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi32_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __C, + __A, + __B, + __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi32_ph (__mmask16 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtdq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi32_ph(A, B) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(A), \ + _mm256_setzero_ph (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi32_ph(A, B, C, D) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepi32_ph(A, B, C) \ + (__builtin_ia32_vcvtdq2ph512_mask_round ((__v16si)(B), \ + _mm256_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtudq2ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu32_ph (__m512i __A) +{ + return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu32_ph (__m256h __A, __mmask16 __B, __m512i __C) +{ + return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu32_ph (__mmask16 __A, __m512i __B) +{ + return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu32_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu32_ph (__m256h __A, __mmask16 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __C, + __A, + __B, + __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu32_ph (__mmask16 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtudq2ph512_mask_round ((__v16si) __B, + _mm256_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepu32_ph(A, B) \ + (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)(A), \ + _mm256_setzero_ph (), \ + (__mmask16)-1, \ + B)) + +#define _mm512_mask_cvt_roundepu32_ph(A, B, C, D) \ + (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)C, \ + A, \ + B, \ + D)) + +#define _mm512_maskz_cvt_roundepu32_ph(A, B, C) \ + (__builtin_ia32_vcvtudq2ph512_mask_round ((__v16si)B, \ + _mm256_setzero_ph (), \ + A, \ + C)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2qq. 
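+   Usage sketch (editorial illustration, not part of the upstream header):
+   only the low 8 half-precision lanes of the __m128h source are used,
+   yielding 8 signed 64-bit integers, e.g.
+     __m512i f16_to_i64 (__m128h h) { return _mm512_cvtph_epi64 (h); }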
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi64(A, B) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi64(A, B, C, D) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epi64(A, B, C) \ + (__builtin_ia32_vcvtph2qq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2uqq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu64(A, B) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epu64(A, B, C, D) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu64(A, B, C) \ + (__builtin_ia32_vcvtph2uqq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2qq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvttph2qq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi64(A, B) \ + (__builtin_ia32_vcvttph2qq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epi64(A, B, C, D) \ + __builtin_ia32_vcvttph2qq512_mask_round ((C), (A), (B), (D)) + +#define _mm512_maskz_cvtt_roundph_epi64(A, B, C) \ + (__builtin_ia32_vcvttph2qq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2uqq. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu64 (__m512i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu64 (__m128h __A, int __B) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__A, + _mm512_setzero_si512 (), + (__mmask8) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu64 (__m512i __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu64 (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvttph2uqq512_mask_round (__B, + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu64(A, B) \ + (__builtin_ia32_vcvttph2uqq512_mask_round ((A), \ + _mm512_setzero_si512 (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu64(A, B, C, D) \ + __builtin_ia32_vcvttph2uqq512_mask_round ((C), (A), (B), (D)) + +#define _mm512_maskz_cvtt_roundph_epu64(A, B, C) \ + (__builtin_ia32_vcvttph2uqq512_mask_round ((B), \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtqq2ph. 
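+   Usage sketch (editorial illustration, not part of the upstream header):
+   8 signed 64-bit lanes narrow into the low 8 half-precision lanes of a
+   __m128h result, e.g.
+     __m128h i64_to_f16 (__m512i v) { return _mm512_cvtepi64_ph (v); }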
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi64_ph (__m512i __A) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m512i __C) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi64_ph (__mmask8 __A, __m512i __B) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi64_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi64_ph (__mmask8 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi64_ph(A, B) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi64_ph(A, B, C, D) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundepi64_ph(A, B, C) \ + (__builtin_ia32_vcvtqq2ph512_mask_round ((__v8di)(B), \ + _mm_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtuqq2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepu64_ph (__m512i __A) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m512i __C) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu64_ph (__mmask8 __A, __m512i __B) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu64_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu64_ph (__m128h __A, __mmask8 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __C, + __A, + __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu64_ph (__mmask8 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di) __B, + _mm_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepu64_ph(A, B) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepu64_ph(A, B, C, D) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundepu64_ph(A, B, C) \ + (__builtin_ia32_vcvtuqq2ph512_mask_round ((__v8di)(B), \ + _mm_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2w. 
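+   Usage sketch (editorial illustration, not part of the upstream header):
+   all 32 half-precision lanes convert to signed 16-bit integers; the mask
+   form keeps unselected lanes from the passthrough operand, e.g.
+     __m512i f16_to_i16 (__m512i keep, __mmask32 m, __m512h h)
+     { return _mm512_mask_cvtph_epi16 (keep, m, h); }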
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epi16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epi16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epi16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epi16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epi16 (__m512i __A, __mmask32 __B, __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epi16(A, B) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_epi16(A, B, C, D) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundph_epi16(A, B, C) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2uw. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_epu16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_epu16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_epu16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_epu16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_epu16 (__m512i __A, __mmask32 __B, __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__C, (__v32hi) __A, __B, __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvtph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_epu16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, (B))) + +#define _mm512_mask_cvt_roundph_epu16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((C), (__v32hi)(A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_epu16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvtph2uw512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2w. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epi16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epi16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epi16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epi16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epi16 (__m512i __A, __mmask32 __B, + __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epi16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2w512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epi16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epi16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epi16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2w512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvttph2uw. 
*/ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvttph_epu16 (__m512h __A) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvttph_epu16 (__m512i __A, __mmask32 __B, __m512h __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__C, + (__v32hi) __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvttph_epu16 (__mmask32 __A, __m512h __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtt_roundph_epu16 (__m512h __A, int __B) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__A, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) -1, + __B); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtt_roundph_epu16 (__m512i __A, __mmask32 __B, + __m512h __C, int __D) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__C, + (__v32hi) __A, + __B, + __D); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtt_roundph_epu16 (__mmask32 __A, __m512h __B, int __C) +{ + return (__m512i) + __builtin_ia32_vcvttph2uw512_mask_round (__B, + (__v32hi) + _mm512_setzero_si512 (), + __A, + __C); +} + +#else +#define _mm512_cvtt_roundph_epu16(A, B) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((A), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvtt_roundph_epu16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((C), \ + (__v32hi)(A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvtt_roundph_epu16(A, B, C) \ + ((__m512i) \ + __builtin_ia32_vcvttph2uw512_mask_round ((B), \ + (__v32hi) \ + _mm512_setzero_si512 (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtw2ph. 
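+   Usage sketch (editorial illustration, not part of the upstream header):
+     __m512h i16_to_f16 (__m512i v) { return _mm512_cvtepi16_ph (v); }
+   16-bit integers larger than 2048 in magnitude are not all exactly
+   representable in half precision and are rounded per the selected mode.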
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtepi16_ph (__m512i __A) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepi16_ph (__m512h __A, __mmask32 __B, __m512i __C) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepi16_ph (__mmask32 __A, __m512i __B) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepi16_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepi16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepi16_ph (__mmask32 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepi16_ph(A, B) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepi16_ph(A, B, C, D) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepi16_ph(A, B, C) \ + (__builtin_ia32_vcvtw2ph512_mask_round ((__v32hi)(B), \ + _mm512_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtuw2ph. 
*/ + extern __inline __m512h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_cvtepu16_ph (__m512i __A) + { + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); + } + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtepu16_ph (__m512h __A, __mmask32 __B, __m512i __C) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtepu16_ph (__mmask32 __A, __m512i __B) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundepu16_ph (__m512i __A, int __B) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __A, + _mm512_setzero_ph (), + (__mmask32) -1, + __B); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundepu16_ph (__m512h __A, __mmask32 __B, __m512i __C, int __D) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __C, + __A, + __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundepu16_ph (__mmask32 __A, __m512i __B, int __C) +{ + return __builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi) __B, + _mm512_setzero_ph (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundepu16_ph(A, B) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(A), \ + _mm512_setzero_ph (), \ + (__mmask32)-1, \ + (B))) + +#define _mm512_mask_cvt_roundepu16_ph(A, B, C, D) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(C), \ + (A), \ + (B), \ + (D))) + +#define _mm512_maskz_cvt_roundepu16_ph(A, B, C) \ + (__builtin_ia32_vcvtuw2ph512_mask_round ((__v32hi)(B), \ + _mm512_setzero_ph (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtsh2si, vcvtsh2us. 
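+   Editorial note (not in the upstream header): these scalar conversions
+   read only element 0 of the __m128h operand, e.g.
+     int f16_to_int (__m128h h) { return _mm_cvtsh_i32 (h); }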
*/ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_i32 (__m128h __A) +{ + return (int) __builtin_ia32_vcvtsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_u32 (__m128h __A) +{ + return (int) __builtin_ia32_vcvtsh2usi32_round (__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_i32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsh2si32_round (__A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_u32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvtsh2usi32_round (__A, __R); +} + +#else +#define _mm_cvt_roundsh_i32(A, B) \ + ((int)__builtin_ia32_vcvtsh2si32_round ((A), (B))) +#define _mm_cvt_roundsh_u32(A, B) \ + ((int)__builtin_ia32_vcvtsh2usi32_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_i64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvtsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_u64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvtsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_i64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsh2si64_round (__A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_u64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvtsh2usi64_round (__A, __R); +} + +#else +#define _mm_cvt_roundsh_i64(A, B) \ + ((long long)__builtin_ia32_vcvtsh2si64_round ((A), (B))) +#define _mm_cvt_roundsh_u64(A, B) \ + ((long long)__builtin_ia32_vcvtsh2usi64_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvttsh2si, vcvttsh2us. 
*/ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_i32 (__m128h __A) +{ + return (int) + __builtin_ia32_vcvttsh2si32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_u32 (__m128h __A) +{ + return (int) + __builtin_ia32_vcvttsh2usi32_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_i32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsh2si32_round (__A, __R); +} + +extern __inline unsigned +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_u32 (__m128h __A, const int __R) +{ + return (int) __builtin_ia32_vcvttsh2usi32_round (__A, __R); +} + +#else +#define _mm_cvtt_roundsh_i32(A, B) \ + ((int)__builtin_ia32_vcvttsh2si32_round ((A), (B))) +#define _mm_cvtt_roundsh_u32(A, B) \ + ((int)__builtin_ia32_vcvttsh2usi32_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_i64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvttsh2si64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsh_u64 (__m128h __A) +{ + return (long long) + __builtin_ia32_vcvttsh2usi64_round (__A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_i64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsh2si64_round (__A, __R); +} + +extern __inline unsigned long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_roundsh_u64 (__m128h __A, const int __R) +{ + return (long long) __builtin_ia32_vcvttsh2usi64_round (__A, __R); +} + +#else +#define _mm_cvtt_roundsh_i64(A, B) \ + ((long long)__builtin_ia32_vcvttsh2si64_round ((A), (B))) +#define _mm_cvtt_roundsh_u64(A, B) \ + ((long long)__builtin_ia32_vcvttsh2usi64_round ((A), (B))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvtsi2sh, vcvtusi2sh. 
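+   Usage sketch (editorial illustration, not part of the upstream header):
+   the integer converts into element 0 of the result while the remaining
+   elements are copied from the first vector operand, e.g.
+     __m128h int_to_f16 (__m128h upper, int x) { return _mm_cvti32_sh (upper, x); }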
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti32_sh (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtsi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu32_sh (__m128h __A, unsigned int __B) +{ + return __builtin_ia32_vcvtusi2sh32_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi32_sh (__m128h __A, int __B, const int __R) +{ + return __builtin_ia32_vcvtsi2sh32_round (__A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu32_sh (__m128h __A, unsigned int __B, const int __R) +{ + return __builtin_ia32_vcvtusi2sh32_round (__A, __B, __R); +} + +#else +#define _mm_cvt_roundi32_sh(A, B, C) \ + (__builtin_ia32_vcvtsi2sh32_round ((A), (B), (C))) +#define _mm_cvt_roundu32_sh(A, B, C) \ + (__builtin_ia32_vcvtusi2sh32_round ((A), (B), (C))) + +#endif /* __OPTIMIZE__ */ + +#ifdef __x86_64__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvti64_sh (__m128h __A, long long __B) +{ + return __builtin_ia32_vcvtsi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtu64_sh (__m128h __A, unsigned long long __B) +{ + return __builtin_ia32_vcvtusi2sh64_round (__A, __B, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundi64_sh (__m128h __A, long long __B, const int __R) +{ + return __builtin_ia32_vcvtsi2sh64_round (__A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundu64_sh (__m128h __A, unsigned long long __B, const int __R) +{ + return __builtin_ia32_vcvtusi2sh64_round (__A, __B, __R); +} + +#else +#define _mm_cvt_roundi64_sh(A, B, C) \ + (__builtin_ia32_vcvtsi2sh64_round ((A), (B), (C))) +#define _mm_cvt_roundu64_sh(A, B, C) \ + (__builtin_ia32_vcvtusi2sh64_round ((A), (B), (C))) + +#endif /* __OPTIMIZE__ */ +#endif /* __x86_64__ */ + +/* Intrinsics vcvtph2pd. 
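+   Usage sketch (editorial illustration, not part of the upstream header):
+   the low 8 half-precision lanes widen losslessly to 8 doubles, e.g.
+     __m512d f16_to_f64 (__m128h h) { return _mm512_cvtph_pd (h); }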
*/ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__A, + _mm512_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtph_pd (__m512d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__B, + _mm512_setzero_pd (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundph_pd (__m128h __A, int __B) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__A, + _mm512_setzero_pd (), + (__mmask8) -1, + __B); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundph_pd (__m512d __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundph_pd (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_vcvtph2pd512_mask_round (__B, + _mm512_setzero_pd (), + __A, + __C); +} + +#else +#define _mm512_cvt_roundph_pd(A, B) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((A), \ + _mm512_setzero_pd (), \ + (__mmask8)-1, \ + (B))) + +#define _mm512_mask_cvt_roundph_pd(A, B, C, D) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvt_roundph_pd(A, B, C) \ + (__builtin_ia32_vcvtph2pd512_mask_round ((B), \ + _mm512_setzero_pd (), \ + (A), \ + (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2psx. 
*/ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtxph_ps (__m256h __A) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__A, + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtxph_ps (__m512 __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtxph_ps (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__B, + _mm512_setzero_ps (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtx_roundph_ps (__m256h __A, int __B) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__A, + _mm512_setzero_ps (), + (__mmask16) -1, + __B); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtx_roundph_ps (__m512 __A, __mmask16 __B, __m256h __C, int __D) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__C, __A, __B, __D); +} + +extern __inline __m512 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtx_roundph_ps (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_vcvtph2psx512_mask_round (__B, + _mm512_setzero_ps (), + __A, + __C); +} + +#else +#define _mm512_cvtx_roundph_ps(A, B) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((A), \ + _mm512_setzero_ps (), \ + (__mmask16)-1, \ + (B))) + +#define _mm512_mask_cvtx_roundph_ps(A, B, C, D) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((C), (A), (B), (D))) + +#define _mm512_maskz_cvtx_roundph_ps(A, B, C) \ + (__builtin_ia32_vcvtph2psx512_mask_round ((B), \ + _mm512_setzero_ps (), \ + (A), \ + (C))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtps2ph. 
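+   Usage sketch (editorial illustration, not part of the upstream header):
+   16 single-precision lanes narrow to 16 half-precision lanes, e.g.
+     __m256h f32_to_f16 (__m512 v) { return _mm512_cvtxps_ph (v); }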
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtxps_ph (__m512 __A) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtxps_ph (__m256h __A, __mmask16 __B, __m512 __C) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C, + __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtxps_ph (__mmask16 __A, __m512 __B) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, + _mm256_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtx_roundps_ph (__m512 __A, int __B) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __A, + _mm256_setzero_ph (), + (__mmask16) -1, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtx_roundps_ph (__m256h __A, __mmask16 __B, __m512 __C, int __D) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __C, + __A, __B, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtx_roundps_ph (__mmask16 __A, __m512 __B, int __C) +{ + return __builtin_ia32_vcvtps2phx512_mask_round ((__v16sf) __B, + _mm256_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_cvtx_roundps_ph(A, B) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(A), \ + _mm256_setzero_ph (),\ + (__mmask16)-1, (B))) + +#define _mm512_mask_cvtx_roundps_ph(A, B, C, D) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(C), \ + (A), (B), (D))) + +#define _mm512_maskz_cvtx_roundps_ph(A, B, C) \ + (__builtin_ia32_vcvtps2phx512_mask_round ((__v16sf)(B), \ + _mm256_setzero_ph (),\ + (A), (C))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtpd2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvtpd_ph (__m512d __A) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m512d __C) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C, + __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvtpd_ph (__mmask8 __A, __m512d __B) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, + _mm_setzero_ph (), + __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_cvt_roundpd_ph (__m512d __A, int __B) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __A, + _mm_setzero_ph (), + (__mmask8) -1, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_cvt_roundpd_ph (__m128h __A, __mmask8 __B, __m512d __C, int __D) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __C, + __A, __B, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_cvt_roundpd_ph (__mmask8 __A, __m512d __B, int __C) +{ + return __builtin_ia32_vcvtpd2ph512_mask_round ((__v8df) __B, + _mm_setzero_ph (), + __A, __C); +} + +#else +#define _mm512_cvt_roundpd_ph(A, B) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(A), \ + _mm_setzero_ph (), \ + (__mmask8)-1, (B))) + +#define _mm512_mask_cvt_roundpd_ph(A, B, C, D) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(C), \ + (A), (B), (D))) + +#define _mm512_maskz_cvt_roundpd_ph(A, B, C) \ + (__builtin_ia32_vcvtpd2ph512_mask_round ((__v8df)(B), \ + _mm_setzero_ph (), \ + (A), (C))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtsh2ss, vcvtsh2sd. 
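+   Usage sketch (editorial illustration, not part of the upstream header):
+   element 0 of the __m128h source widens to single or double precision;
+   the upper lanes of the result come from the first operand, e.g.
+     __m128 sh_to_ss (__m128 upper, __m128h h) { return _mm_cvtsh_ss (upper, h); }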
*/ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_ss (__m128 __A, __m128h __B) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, + _mm_setzero_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsh_ss (__m128 __A, __mmask8 __B, __m128 __C, + __m128h __D) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsh_ss (__mmask8 __A, __m128 __B, + __m128h __C) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, + _mm_setzero_ps (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsh_sd (__m128d __A, __m128h __B) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, + _mm_setzero_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsh_sd (__m128d __A, __mmask8 __B, __m128d __C, + __m128h __D) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsh_sd (__mmask8 __A, __m128d __B, __m128h __C) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, + _mm_setzero_pd (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_ss (__m128 __A, __m128h __B, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__B, __A, + _mm_setzero_ps (), + (__mmask8) -1, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsh_ss (__m128 __A, __mmask8 __B, __m128 __C, + __m128h __D, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsh_ss (__mmask8 __A, __m128 __B, + __m128h __C, const int __R) +{ + return __builtin_ia32_vcvtsh2ss_mask_round (__C, __B, + _mm_setzero_ps (), + __A, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsh_sd (__m128d __A, __m128h __B, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__B, __A, + _mm_setzero_pd (), + (__mmask8) -1, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsh_sd (__m128d __A, __mmask8 __B, __m128d __C, + __m128h __D, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsh_sd (__mmask8 __A, __m128d __B, __m128h __C, const int __R) +{ + return __builtin_ia32_vcvtsh2sd_mask_round (__C, __B, + _mm_setzero_pd (), + __A, __R); +} + +#else +#define _mm_cvt_roundsh_ss(A, B, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((B), (A), \ + _mm_setzero_ps (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundsh_ss(A, B, C, D, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsh_ss(A, 
B, C, R) \ + (__builtin_ia32_vcvtsh2ss_mask_round ((C), (B), \ + _mm_setzero_ps (), \ + (A), (R))) + +#define _mm_cvt_roundsh_sd(A, B, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((B), (A), \ + _mm_setzero_pd (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundsh_sd(A, B, C, D, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsh_sd(A, B, C, R) \ + (__builtin_ia32_vcvtsh2sd_mask_round ((C), (B), \ + _mm_setzero_pd (), \ + (A), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtss2sh, vcvtsd2sh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sh (__m128h __A, __m128 __B) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtss_sh (__mmask8 __A, __m128h __B, __m128 __C) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_sh (__m128h __A, __m128d __B) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsd_sh (__mmask8 __A, __m128h __B, __m128d __C) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundss_sh (__m128h __A, __m128 __B, const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundss_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128 __D, + const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__D, __C, __A, __B, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundss_sh (__mmask8 __A, __m128h __B, __m128 __C, + const int __R) +{ + return __builtin_ia32_vcvtss2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_roundsd_sh (__m128h __A, __m128d __B, const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__B, __A, + _mm_setzero_ph (), + (__mmask8) -1, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvt_roundsd_sh (__m128h __A, __mmask8 __B, __m128h __C, __m128d __D, + const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__D, __C, __A, __B, __R); +} + +extern 
__inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvt_roundsd_sh (__mmask8 __A, __m128h __B, __m128d __C, + const int __R) +{ + return __builtin_ia32_vcvtsd2sh_mask_round (__C, __B, + _mm_setzero_ph (), + __A, __R); +} + +#else +#define _mm_cvt_roundss_sh(A, B, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundss_sh(A, B, C, D, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundss_sh(A, B, C, R) \ + (__builtin_ia32_vcvtss2sh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + (A), (R))) + +#define _mm_cvt_roundsd_sh(A, B, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((B), (A), \ + _mm_setzero_ph (), \ + (__mmask8) -1, (R))) + +#define _mm_mask_cvt_roundsd_sh(A, B, C, D, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((D), (C), (A), (B), (R))) + +#define _mm_maskz_cvt_roundsd_sh(A, B, C, R) \ + (__builtin_ia32_vcvtsd2sh_mask_round ((C), (B), \ + _mm_setzero_ph (), \ + (A), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfmaddsub[132,213,231]ph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_ph (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmaddsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmaddsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmaddsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmaddsub_round_ph (__m512h __A, __m512h __B, __m512h __C, + __mmask32 __U, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm512_maskz_fmaddsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmaddsubph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +#else +#define _mm512_fmaddsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask ((A), (B), (C), -1, (R))) + +#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask ((A), (B), (C), (U), (R))) + +#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask3 ((A), (B), (C), (U), (R))) + +#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfmsubadd[132,213,231]ph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmsubadd_ph (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_ph (__m512h __A, __mmask32 __U, + __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_ph (__m512h __A, __m512h __B, + __m512h __C, __mmask32 __U) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_ph (__mmask32 __U, __m512h __A, + __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsubadd_round_ph (__m512h __A, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsubadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsubadd_round_ph (__m512h __A, __m512h __B, __m512h __C, + __mmask32 __U, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsubadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) + __builtin_ia32_vfmsubaddph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +#else 
+#define _mm512_fmsubadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmsubaddph512_mask ((A), (B), (C), -1, (R))) + +#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmsubaddph512_mask ((A), (B), (C), (U), (R))) + +#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubaddph512_mask3 ((A), (B), (C), (U), (R))) + +#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmsubaddph512_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfmadd[132,213,231]ph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm512_fmadd_ph (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) +{ + return (__m512h) + __builtin_ia32_vfmaddph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, + __mmask32 __U, const int __R) +{ + return (__m512h) __builtin_ia32_vfmaddph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfmaddph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +#else +#define _mm512_fmadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask ((A), (B), (C), -1, (R))) + +#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask ((A), (B), (C), (U), (R))) + +#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \ + 
((__m512h)__builtin_ia32_vfmaddph512_mask3 ((A), (B), (C), (U), (R))) + +#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfnmadd[132,213,231]ph. */ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_ph (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) +{ + return (__m512h) + __builtin_ia32_vfnmaddph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfnmaddph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmadd_round_ph (__m512h __A, __mmask32 __U, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmaddph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmadd_round_ph (__m512h __A, __m512h __B, __m512h __C, + __mmask32 __U, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmaddph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmadd_round_ph (__mmask32 __U, __m512h __A, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmaddph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +#else +#define _mm512_fnmadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfnmaddph512_mask ((A), (B), (C), -1, (R))) + +#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfnmaddph512_mask ((A), (B), (C), (U), (R))) + +#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfnmaddph512_mask3 ((A), (B), (C), (U), (R))) + +#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfnmaddph512_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfmsub[132,213,231]ph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_ph (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) +{ + return (__m512h) + __builtin_ia32_vfmsubph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmsubph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, + __mmask32 __U, const int __R) +{ + return (__m512h) __builtin_ia32_vfmsubph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfmsubph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +#else +#define _mm512_fmsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_mask ((A), (B), (C), -1, (R))) + +#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_mask ((A), (B), (C), (U), (R))) + +#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_mask3 ((A), (B), (C), (U), (R))) + +#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfnmsub[132,213,231]ph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_ph (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_ph (__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_ph (__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) +{ + return (__m512h) + __builtin_ia32_vfnmsubph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_ph (__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfnmsubph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fnmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) -1, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fnmsub_round_ph (__m512h __A, __mmask32 __U, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmsubph512_mask ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fnmsub_round_ph (__m512h __A, __m512h __B, __m512h __C, + __mmask32 __U, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmsubph512_mask3 ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fnmsub_round_ph (__mmask32 __U, __m512h __A, __m512h __B, + __m512h __C, const int __R) +{ + return (__m512h) __builtin_ia32_vfnmsubph512_maskz ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + (__mmask32) __U, __R); +} + +#else +#define _mm512_fnmsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfnmsubph512_mask ((A), (B), (C), -1, (R))) + +#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfnmsubph512_mask ((A), (B), (C), (U), (R))) + +#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfnmsubph512_mask3 ((A), (B), (C), (U), (R))) + +#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfnmsubph512_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfmadd[132,213,231]sh. 
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_sh (__m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask3 ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B, + const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U, + const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask3 ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_sh (__mmask8 __U, __m128h __W, __m128h __A, + __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +#else +#define _mm_fmadd_round_sh(A, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), (C), (-1), (R))) +#define _mm_mask_fmadd_round_sh(A, U, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), (C), (U), (R))) +#define _mm_mask3_fmadd_round_sh(A, B, C, U, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_mask3 ((A), (B), (C), (U), (R))) +#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfnmadd[132,213,231]sh. 
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_sh (__m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_maskz ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B, + const int __R) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U, + const int __R) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_round_sh (__mmask8 __U, __m128h __W, __m128h __A, + __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfnmaddsh3_maskz ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +#else +#define _mm_fnmadd_round_sh(A, B, C, R) \ + ((__m128h) __builtin_ia32_vfnmaddsh3_mask ((A), (B), (C), (-1), (R))) +#define _mm_mask_fnmadd_round_sh(A, U, B, C, R) \ + ((__m128h) __builtin_ia32_vfnmaddsh3_mask ((A), (B), (C), (U), (R))) +#define _mm_mask3_fnmadd_round_sh(A, B, C, U, R) \ + ((__m128h) __builtin_ia32_vfnmaddsh3_mask3 ((A), (B), (C), (U), (R))) +#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \ + ((__m128h) __builtin_ia32_vfnmaddsh3_maskz ((A), (B), (C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfmsub[132,213,231]sh. 
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_sh (__m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + -(__v8hf) __B, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W, + (__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + -(__v8hf) __B, + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B, + const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + (__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U, + const int __R) +{ + return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W, + (__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A, + __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W, + (__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, __R); +} + +#else +#define _mm_fmsub_round_sh(A, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), -(C), (-1), (R))) +#define _mm_mask_fmsub_round_sh(A, U, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), (B), -(C), (U), (R))) +#define _mm_mask3_fmsub_round_sh(A, B, C, U, R) \ + ((__m128h) __builtin_ia32_vfmsubsh3_mask3 ((A), (B), (C), (U), (R))) +#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), (B), -(C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfnmsub[132,213,231]sh. 
*/ +extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_sh (__m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + -(__v8hf) __A, + -(__v8hf) __B, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + -(__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W, + -(__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_sh (__mmask8 __U, __m128h __W, __m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W, + -(__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + -(__v8hf) __A, + -(__v8hf) __B, + (__mmask8) -1, + __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_round_sh (__m128h __W, __mmask8 __U, __m128h __A, __m128h __B, + const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_mask ((__v8hf) __W, + -(__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_round_sh (__m128h __W, __m128h __A, __m128h __B, __mmask8 __U, + const int __R) +{ + return (__m128h) __builtin_ia32_vfmsubsh3_mask3 ((__v8hf) __W, + -(__v8hf) __A, + (__v8hf) __B, + (__mmask8) __U, __R); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_round_sh (__mmask8 __U, __m128h __W, __m128h __A, + __m128h __B, const int __R) +{ + return (__m128h) __builtin_ia32_vfmaddsh3_maskz ((__v8hf) __W, + -(__v8hf) __A, + -(__v8hf) __B, + (__mmask8) __U, __R); +} + +#else +#define _mm_fnmsub_round_sh(A, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), -(B), -(C), (-1), (R))) +#define _mm_mask_fnmsub_round_sh(A, U, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_mask ((A), -(B), -(C), (U), (R))) +#define _mm_mask3_fnmsub_round_sh(A, B, C, U, R) \ + ((__m128h) __builtin_ia32_vfmsubsh3_mask3 ((A), -(B), (C), (U), (R))) +#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \ + ((__m128h) __builtin_ia32_vfmaddsh3_maskz ((A), -(B), -(C), (U), (R))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]maddcph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fcmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_pch (__m512h __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_pch (__m512h __A, __m512h __B, __m512h __C, __mmask16 __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_pch (__mmask16 __A, __m512h __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fcmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, + __mmask16 __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); +} + +extern __inline __m512h 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, const int __D) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmadd_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) __A, + (__v32hf) __C, + (__v32hf) __D, __B, + __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask3_fmadd_round_pch (__m512h __A, __m512h __B, __m512h __C, + __mmask16 __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_mask3_round ((__v32hf) __A, + (__v32hf) __B, + (__v32hf) __C, + __D, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmadd_round_pch (__mmask16 __A, __m512h __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmaddcph512_maskz_round ((__v32hf) __B, + (__v32hf) __C, + (__v32hf) __D, + __A, __E); +} + +#else +#define _mm512_fcmadd_round_pch(A, B, C, D) \ + (__m512h) __builtin_ia32_vfcmaddcph512_round ((A), (B), (C), (D)) + +#define _mm512_mask_fcmadd_round_pch(A, B, C, D, E) \ + ((__m512h) \ + __builtin_ia32_vfcmaddcph512_mask_round ((__v32hf) (A), \ + (__v32hf) (C), \ + (__v32hf) (D), \ + (B), (E))) + + +#define _mm512_mask3_fcmadd_round_pch(A, B, C, D, E) \ + ((__m512h) \ + __builtin_ia32_vfcmaddcph512_mask3_round ((A), (B), (C), (D), (E))) + +#define _mm512_maskz_fcmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfcmaddcph512_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm512_fmadd_round_pch(A, B, C, D) \ + (__m512h) __builtin_ia32_vfmaddcph512_round ((A), (B), (C), (D)) + +#define _mm512_mask_fmadd_round_pch(A, B, C, D, E) \ + ((__m512h) \ + __builtin_ia32_vfmaddcph512_mask_round ((__v32hf) (A), \ + (__v32hf) (C), \ + (__v32hf) (D), \ + (B), (E))) + +#define _mm512_mask3_fmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfmaddcph512_mask3_round ((A), (B), (C), (D), (E)) + +#define _mm512_maskz_fmadd_round_pch(A, B, C, D, E) \ + (__m512h) \ + __builtin_ia32_vfmaddcph512_maskz_round ((B), (C), (D), (A), (E)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]mulcph. 
*/ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmul_pch (__m512h __A, __m512h __B) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmul_pch (__mmask16 __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmul_pch (__m512h __A, __m512h __B) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmul_pch (__m512h __A, __mmask16 __B, __m512h __C, __m512h __D) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmul_pch (__mmask16 __A, __m512h __B, __m512h __C) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fcmul_round_pch (__m512h __A, __m512h __B, const int __D) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_round ((__v32hf) __A, + (__v32hf) __B, __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fcmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fcmul_round_pch (__mmask16 __A, __m512h __B, + __m512h __C, const int __E) +{ + return (__m512h) + __builtin_ia32_vfcmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_fmul_round_pch (__m512h __A, __m512h __B, const int __D) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_round ((__v32hf) __A, + (__v32hf) __B, + __D); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_fmul_round_pch (__m512h __A, __mmask16 __B, __m512h __C, + __m512h __D, const int __E) +{ + return (__m512h) + __builtin_ia32_vfmulcph512_mask_round ((__v32hf) __C, + (__v32hf) __D, + (__v32hf) __A, + __B, __E); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_fmul_round_pch (__mmask16 __A, __m512h __B, + __m512h __C, const int __E) +{ + return (__m512h) + 
__builtin_ia32_vfmulcph512_mask_round ((__v32hf) __B, + (__v32hf) __C, + _mm512_setzero_ph (), + __A, __E); +} + +#else +#define _mm512_fcmul_round_pch(A, B, D) \ + (__m512h) __builtin_ia32_vfcmulcph512_round ((A), (B), (D)) + +#define _mm512_mask_fcmul_round_pch(A, B, C, D, E) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((C), (D), (A), (B), (E)) + +#define _mm512_maskz_fcmul_round_pch(A, B, C, E) \ + (__m512h) __builtin_ia32_vfcmulcph512_mask_round ((B), (C), \ + (__v32hf) \ + _mm512_setzero_ph (), \ + (A), (E)) + +#define _mm512_fmul_round_pch(A, B, D) \ + (__m512h) __builtin_ia32_vfmulcph512_round ((A), (B), (D)) + +#define _mm512_mask_fmul_round_pch(A, B, C, D, E) \ + (__m512h) __builtin_ia32_vfmulcph512_mask_round ((C), (D), (A), (B), (E)) + +#define _mm512_maskz_fmul_round_pch(A, B, C, E) \ + (__m512h) __builtin_ia32_vfmulcph512_mask_round ((B), (C), \ + (__v32hf) \ + _mm512_setzero_ph (), \ + (A), (E)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]maddcsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_sch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_sch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_sch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_sch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_mask_fcmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_maskz_round ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_round_sch (__m128h __A, __m128h __B, __m128h __C, const int __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcsh_round ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + __D); +} +#else +#define _mm_mask_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) \ + __builtin_ia32_vfcmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E))) + + +#define _mm_mask3_fcmadd_round_sch(A, B, C, D, E) \ + ((__m128h) \ + __builtin_ia32_vfcmaddcsh_mask3_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E))) + +#define _mm_maskz_fcmadd_round_sch(A, B, C, D, E) \ + __builtin_ia32_vfcmaddcsh_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm_fcmadd_round_sch(A, B, C, D) \ + __builtin_ia32_vfcmaddcsh_round ((A), (B), (C), (D)) + +#define _mm_mask_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) \ + __builtin_ia32_vfmaddcsh_mask_round ((__v8hf) (A), \ + (__v8hf) (C), \ + (__v8hf) (D), \ + (B), (E))) + +#define _mm_mask3_fmadd_round_sch(A, B, C, D, E) \ + ((__m128h) \ + __builtin_ia32_vfmaddcsh_mask3_round ((__v8hf) (A), \ + (__v8hf) (B), \ + (__v8hf) (C), \ + (D), (E))) + +#define _mm_maskz_fmadd_round_sch(A, B, C, D, E) \ + __builtin_ia32_vfmaddcsh_maskz_round ((B), (C), (D), (A), (E)) + +#define _mm_fmadd_round_sch(A, B, C, D) \ + __builtin_ia32_vfmaddcsh_round ((A), (B), 
(C), (D)) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vf[,c]mulcsh. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_sch (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_sch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_sch (__m128h __A, __m128h __B) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_sch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, _MM_FROUND_CUR_DIRECTION); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_sch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_round_sch (__m128h __A, __m128h __B, const int __D) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, + (__v8hf) __B, + __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, + const int __E) +{ + return (__m128h) + __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_round_sch (__m128h __A, __m128h __B, const int __D) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_round ((__v8hf) __A, + (__v8hf) __B, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_round_sch (__m128h __A, __mmask8 __B, __m128h __C, + __m128h __D, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, + __B, __E); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_round_sch (__mmask8 __A, __m128h __B, __m128h __C, const int __E) +{ + return (__m128h) + __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, + (__v8hf) __C, 
+ _mm_setzero_ph (), + __A, __E); +} + +#else +#define _mm_fcmul_round_sch(__A, __B, __D) \ + (__m128h) __builtin_ia32_vfcmulcsh_round ((__v8hf) __A, \ + (__v8hf) __B, __D) + +#define _mm_mask_fcmul_round_sch(__A, __B, __C, __D, __E) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __C, \ + (__v8hf) __D, \ + (__v8hf) __A, \ + __B, __E) + +#define _mm_maskz_fcmul_round_sch(__A, __B, __C, __E) \ + (__m128h) __builtin_ia32_vfcmulcsh_mask_round ((__v8hf) __B, \ + (__v8hf) __C, \ + _mm_setzero_ph (), \ + __A, __E) + +#define _mm_fmul_round_sch(__A, __B, __D) \ + (__m128h) __builtin_ia32_vfmulcsh_round ((__v8hf) __A, \ + (__v8hf) __B, __D) + +#define _mm_mask_fmul_round_sch(__A, __B, __C, __D, __E) \ + (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __C, \ + (__v8hf) __D, \ + (__v8hf) __A, \ + __B, __E) + +#define _mm_maskz_fmul_round_sch(__A, __B, __C, __E) \ + (__m128h) __builtin_ia32_vfmulcsh_mask_round ((__v8hf) __B, \ + (__v8hf) __C, \ + _mm_setzero_ph (), \ + __A, __E) + +#endif /* __OPTIMIZE__ */ + +#define _MM512_REDUCE_OP(op) \ + __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256h __T3 = (__T1 op __T2); \ + __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \ + __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \ + __m128h __T6 = (__T4 op __T5); \ + __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T8 = (__T6 op __T7); \ + __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \ + (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T10 = __T8 op __T9; \ + return __T10[0] op __T10[1] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_add_ph (__m512h __A) +{ + _MM512_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_mul_ph (__m512h __A) +{ + _MM512_REDUCE_OP (*); +} + +#undef _MM512_REDUCE_OP + +#ifdef __AVX512VL__ + +#define _MM512_REDUCE_OP(op) \ + __m256h __T1 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 0); \ + __m256h __T2 = (__m256h) _mm512_extractf64x4_pd ((__m512d) __A, 1); \ + __m256h __T3 = __builtin_ia32_##op##ph256_mask (__T1, __T2, \ + _mm256_setzero_ph (), (__mmask16) -1); \ + __m128h __T4 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 0); \ + __m128h __T5 = (__m128h) _mm256_extractf128_pd ((__m256d) __T3, 1); \ + __m128h __T6 = __builtin_ia32_##op##ph128_mask \ + (__T4, __T5, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T7 = (__m128h) __builtin_shuffle ((__m128h)__T6, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T8 = (__m128h) __builtin_ia32_##op##ph128_mask \ + (__T6, __T7, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T9 = (__m128h) __builtin_shuffle ((__m128h)__T8, \ + (__v8hi) { 4, 5 }); \ + __m128h __T10 = __builtin_ia32_##op##ph128_mask \ + (__T8, __T9, _mm_setzero_ph (),(__mmask8) -1); \ + __m128h __T11 = (__m128h) __builtin_shuffle (__T10, \ + (__v8hi) { 1, 0 }); \ + __m128h __T12 = __builtin_ia32_##op##ph128_mask \ + (__T10, __T11, _mm_setzero_ph (),(__mmask8) -1); \ + return __T12[0] + +#else + +#define _MM512_REDUCE_OP(op) \ + __m512h __T1 = (__m512h) __builtin_shuffle ((__m512d) __A, \ + (__v8di) { 4, 5, 6, 7, 0, 0, 0, 0 }); \ + __m512h __T2 = _mm512_##op##_ph (__A, __T1); \ + __m512h __T3 = (__m512h) __builtin_shuffle ((__m512d) __T2, \ + (__v8di) { 2, 3, 0, 0, 0, 
0, 0, 0 }); \ + __m512h __T4 = _mm512_##op##_ph (__T2, __T3); \ + __m512h __T5 = (__m512h) __builtin_shuffle ((__m512d) __T4, \ + (__v8di) { 1, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T6 = _mm512_##op##_ph (__T4, __T5); \ + __m512h __T7 = (__m512h) __builtin_shuffle ((__m512) __T6, \ + (__v16si) { 1, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T8 = _mm512_##op##_ph (__T6, __T7); \ + __m512h __T9 = (__m512h) __builtin_shuffle (__T8, \ + (__v32hi) { 1, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0, \ + 0, 0, 0, 0, 0, 0, 0, 0 }); \ + __m512h __T10 = _mm512_##op##_ph (__T8, __T9); \ + return __T10[0] +#endif + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_min_ph (__m512h __A) +{ + _MM512_REDUCE_OP (min); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_reduce_max_ph (__m512h __A) +{ + _MM512_REDUCE_OP (max); +} + +#undef _MM512_REDUCE_OP + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_blend_ph (__mmask32 __U, __m512h __A, __m512h __W) +{ + return (__m512h) __builtin_ia32_movdquhi512_mask ((__v32hi) __W, + (__v32hi) __A, + (__mmask32) __U); + +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_ph (__m512h __A, __m512i __I, __m512h __B) +{ + return (__m512h) __builtin_ia32_vpermi2varhi512_mask ((__v32hi) __A, + (__v32hi) __I, + (__v32hi) __B, + (__mmask32)-1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_ph (__m512i __A, __m512h __B) +{ + return (__m512h) __builtin_ia32_permvarhi512_mask ((__v32hi) __B, + (__v32hi) __A, + (__v32hi) + (_mm512_setzero_ph ()), + (__mmask32)-1); +} + +extern __inline __m512h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_set1_pch (_Float16 _Complex __A) +{ + union + { + _Float16 _Complex __a; + float __b; + } __u = { .__a = __A}; + + return (__m512h) _mm512_set1_ps (__u.__b); +} + +// intrinsics below are alias for f*mul_*ch +#define _mm512_mul_pch(A, B) _mm512_fmul_pch ((A), (B)) +#define _mm512_mask_mul_pch(W, U, A, B) \ + _mm512_mask_fmul_pch ((W), (U), (A), (B)) +#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch ((U), (A), (B)) +#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch ((A), (B), (R)) +#define _mm512_mask_mul_round_pch(W, U, A, B, R) \ + _mm512_mask_fmul_round_pch ((W), (U), (A), (B), (R)) +#define _mm512_maskz_mul_round_pch(U, A, B, R) \ + _mm512_maskz_fmul_round_pch ((U), (A), (B), (R)) + +#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch ((A), (B)) +#define _mm512_mask_cmul_pch(W, U, A, B) \ + _mm512_mask_fcmul_pch ((W), (U), (A), (B)) +#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch ((U), (A), (B)) +#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch ((A), (B), (R)) +#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \ + _mm512_mask_fcmul_round_pch ((W), (U), (A), (B), (R)) +#define _mm512_maskz_cmul_round_pch(U, A, B, R) \ + _mm512_maskz_fcmul_round_pch ((U), (A), (B), (R)) + +#define _mm_mul_sch(A, B) _mm_fmul_sch ((A), (B)) +#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch ((W), (U), (A), (B)) +#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch ((U), (A), (B)) +#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch ((A), (B), (R)) +#define _mm_mask_mul_round_sch(W, U, A, B, R) \ + 
_mm_mask_fmul_round_sch ((W), (U), (A), (B), (R)) +#define _mm_maskz_mul_round_sch(U, A, B, R) \ + _mm_maskz_fmul_round_sch ((U), (A), (B), (R)) + +#define _mm_cmul_sch(A, B) _mm_fcmul_sch ((A), (B)) +#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch ((W), (U), (A), (B)) +#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch ((U), (A), (B)) +#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch ((A), (B), (R)) +#define _mm_mask_cmul_round_sch(W, U, A, B, R) \ + _mm_mask_fcmul_round_sch ((W), (U), (A), (B), (R)) +#define _mm_maskz_cmul_round_sch(U, A, B, R) \ + _mm_maskz_fcmul_round_sch ((U), (A), (B), (R)) + +#ifdef __DISABLE_AVX512FP16__ +#undef __DISABLE_AVX512FP16__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512FP16__ */ + +#endif /* __AVX512FP16INTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512fp16vlintrin.h b/include-gcc/avx512fp16vlintrin.h new file mode 100644 index 0000000..308b0b2 --- /dev/null +++ b/include-gcc/avx512fp16vlintrin.h @@ -0,0 +1,3362 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512fp16vlintrin.h> directly; include <immintrin.h> instead." 
+#endif + +#ifndef __AVX512FP16VLINTRIN_H_INCLUDED +#define __AVX512FP16VLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512FP16__) +#pragma GCC push_options +#pragma GCC target("avx512fp16,avx512vl") +#define __DISABLE_AVX512FP16VL__ +#endif /* __AVX512FP16VL__ */ + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_ps (__m128h __a) +{ + return (__m128) __a; +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_ps (__m256h __a) +{ + return (__m256) __a; +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_pd (__m128h __a) +{ + return (__m128d) __a; +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_pd (__m256h __a) +{ + return (__m256d) __a; +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castph_si128 (__m128h __a) +{ + return (__m128i) __a; +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph_si256 (__m256h __a) +{ + return (__m256i) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_ph (__m128 __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_ph (__m256 __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_ph (__m128d __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_ph (__m256d __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_ph (__m128i __a) +{ + return (__m128h) __a; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_ph (__m256i __a) +{ + return (__m256h) __a; +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph256_ph128 (__m256h __A) +{ + union + { + __m128h __a[2]; + __m256h __v; + } __u = { .__v = __A }; + return __u.__a[0]; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castph128_ph256 (__m128h __A) +{ + union + { + __m128h __a[2]; + __m256h __v; + } __u; + __u.__a[0] = __A; + return __u.__v; +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextph128_ph256 (__m128h __A) +{ + return (__m256h) _mm256_insertf128_ps (_mm256_setzero_ps (), + (__m128) __A, 0); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conj_pch (__m256h __A) +{ + return (__m256h) _mm256_xor_epi32 ((__m256i) __A, _mm256_set1_epi32 (1<<31)); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conj_pch (__m256h __W, __mmask8 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_movaps256_mask ((__v8sf) + _mm256_conj_pch (__A), + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conj_pch (__mmask8 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_movaps256_mask 
((__v8sf) + _mm256_conj_pch (__A), + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conj_pch (__m128h __A) +{ + return (__m128h) _mm_xor_epi32 ((__m128i) __A, _mm_set1_epi32 (1<<31)); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conj_pch (__m128h __W, __mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A), + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conj_pch (__mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_movaps128_mask ((__v4sf) _mm_conj_pch (__A), + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +/* Intrinsics v[add,sub,mul,div]ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A + (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A + (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_addph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_addph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_addph128_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_addph256_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A - (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A - (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_subph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_subph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_subph128_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_subph256_mask (__B, __C, + _mm256_setzero_ph 
(), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A * (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A * (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_mulph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_mulph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_mulph128_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_mulph256_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ph (__m128h __A, __m128h __B) +{ + return (__m128h) ((__v8hf) __A / (__v8hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_ph (__m256h __A, __m256h __B) +{ + return (__m256h) ((__v16hf) __A / (__v16hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_divph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_divph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_divph128_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_divph256_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +/* Intrinsics v[max,min]ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ph (__m128h __A, __m128h __B) +{ + return __builtin_ia32_maxph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_ph (__m256h __A, __m256h __B) +{ + return __builtin_ia32_maxph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_maxph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_maxph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_maxph128_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_maxph256_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ph (__m128h __A, __m128h __B) +{ + return __builtin_ia32_minph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_ph (__m256h __A, __m256h __B) +{ + return __builtin_ia32_minph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_minph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_ph (__m256h __A, __mmask16 __B, __m256h __C, __m256h __D) +{ + return __builtin_ia32_minph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_minph128_mask (__B, __C, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_minph256_mask (__B, __C, + _mm256_setzero_ph (), __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_ph (__m128h __A) +{ + return (__m128h) _mm_and_si128 ( _mm_set1_epi32 (0x7FFF7FFF), + (__m128i) __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_ph (__m256h __A) +{ + return (__m256h) _mm256_and_si256 ( _mm256_set1_epi32 (0x7FFF7FFF), + (__m256i) __A); +} + +/* vcmpph */ +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ph_mask (__m128h __A, __m128h __B, const int __C) +{ + return (__mmask8) __builtin_ia32_cmpph128_mask (__A, __B, __C, + (__mmask8) -1); +} + +extern 
__inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ph_mask (__mmask8 __A, __m128h __B, __m128h __C, + const int __D) +{ + return (__mmask8) __builtin_ia32_cmpph128_mask (__B, __C, __D, __A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ph_mask (__m256h __A, __m256h __B, const int __C) +{ + return (__mmask16) __builtin_ia32_cmpph256_mask (__A, __B, __C, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_ph_mask (__mmask16 __A, __m256h __B, __m256h __C, + const int __D) +{ + return (__mmask16) __builtin_ia32_cmpph256_mask (__B, __C, __D, + __A); +} + +#else +#define _mm_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_cmpph128_mask ((A), (B), (C), (-1))) + +#define _mm_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph128_mask ((B), (C), (D), (A))) + +#define _mm256_cmp_ph_mask(A, B, C) \ + (__builtin_ia32_cmpph256_mask ((A), (B), (C), (-1))) + +#define _mm256_mask_cmp_ph_mask(A, B, C, D) \ + (__builtin_ia32_cmpph256_mask ((B), (C), (D), (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vsqrtph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ph (__m128h __A) +{ + return __builtin_ia32_sqrtph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_ph (__m256h __A) +{ + return __builtin_ia32_sqrtph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_sqrtph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sqrt_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_sqrtph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_sqrtph128_mask (__B, _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_sqrtph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vrsqrtph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ph (__m128h __A) +{ + return __builtin_ia32_rsqrtph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt_ph (__m256h __A) +{ + return __builtin_ia32_rsqrtph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_rsqrtph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rsqrt_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_rsqrtph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_rsqrtph128_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rsqrt_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_rsqrtph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vrcpph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ph (__m128h __A) +{ + return __builtin_ia32_rcpph128_mask (__A, _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_ph (__m256h __A) +{ + return __builtin_ia32_rcpph256_mask (__A, _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp_ph (__m128h __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_rcpph128_mask (__C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp_ph (__m256h __A, __mmask16 __B, __m256h __C) +{ + return __builtin_ia32_rcpph256_mask (__C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp_ph (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_rcpph128_mask (__B, _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp_ph (__mmask16 __A, __m256h __B) +{ + return __builtin_ia32_rcpph256_mask (__B, _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vscalefph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ph (__m128h __A, __m128h __B) +{ + return __builtin_ia32_scalefph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_scalef_ph (__m256h __A, __m256h __B) +{ + return __builtin_ia32_scalefph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_ph (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return __builtin_ia32_scalefph128_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_scalef_ph (__m256h __A, __mmask16 __B, __m256h __C, + __m256h __D) +{ + return __builtin_ia32_scalefph256_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_ph (__mmask8 __A, __m128h __B, __m128h __C) +{ + return __builtin_ia32_scalefph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_scalef_ph (__mmask16 __A, __m256h __B, __m256h __C) +{ + return __builtin_ia32_scalefph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vreduceph. */ +#ifdef __OPTIMIZE__ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_ph (__m128h __A, int __B) +{ + return __builtin_ia32_reduceph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_reduceph128_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_ph (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_reduceph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_ph (__m256h __A, int __B) +{ + return __builtin_ia32_reduceph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_ph (__m256h __A, __mmask16 __B, __m256h __C, int __D) +{ + return __builtin_ia32_reduceph256_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_ph (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_reduceph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +#else +#define _mm_reduce_ph(A, B) \ + (__builtin_ia32_reduceph128_mask ((A), (B), \ + _mm_setzero_ph (), \ + ((__mmask8)-1))) + +#define _mm_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_reduce_ph(A, B, C) \ + (__builtin_ia32_reduceph128_mask ((B), (C), _mm_setzero_ph (), (A))) + +#define _mm256_reduce_ph(A, B) \ + (__builtin_ia32_reduceph256_mask ((A), (B), \ + _mm256_setzero_ph (), \ + ((__mmask16)-1))) + +#define _mm256_mask_reduce_ph(A, B, C, D) \ + (__builtin_ia32_reduceph256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_reduce_ph(A, B, 
C) \ + (__builtin_ia32_reduceph256_mask ((B), (C), _mm256_setzero_ph (), (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vrndscaleph. */ +#ifdef __OPTIMIZE__ + extern __inline __m128h + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm_roundscale_ph (__m128h __A, int __B) + { + return __builtin_ia32_rndscaleph128_mask (__A, __B, + _mm_setzero_ph (), + (__mmask8) -1); + } + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ph (__m128h __A, __mmask8 __B, __m128h __C, int __D) +{ + return __builtin_ia32_rndscaleph128_mask (__C, __D, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ph (__mmask8 __A, __m128h __B, int __C) +{ + return __builtin_ia32_rndscaleph128_mask (__B, __C, + _mm_setzero_ph (), __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_ph (__m256h __A, int __B) +{ + return __builtin_ia32_rndscaleph256_mask (__A, __B, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_ph (__m256h __A, __mmask16 __B, __m256h __C, + int __D) +{ + return __builtin_ia32_rndscaleph256_mask (__C, __D, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_ph (__mmask16 __A, __m256h __B, int __C) +{ + return __builtin_ia32_rndscaleph256_mask (__B, __C, + _mm256_setzero_ph (), + __A); +} + +#else +#define _mm_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph128_mask ((A), (B), _mm_setzero_ph (), \ + ((__mmask8)-1))) + +#define _mm_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph128_mask ((C), (D), (A), (B))) + +#define _mm_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph128_mask ((B), (C), _mm_setzero_ph (), (A))) + +#define _mm256_roundscale_ph(A, B) \ + (__builtin_ia32_rndscaleph256_mask ((A), (B), \ + _mm256_setzero_ph(), \ + ((__mmask16)-1))) + +#define _mm256_mask_roundscale_ph(A, B, C, D) \ + (__builtin_ia32_rndscaleph256_mask ((C), (D), (A), (B))) + +#define _mm256_maskz_roundscale_ph(A, B, C) \ + (__builtin_ia32_rndscaleph256_mask ((B), (C), \ + _mm256_setzero_ph (), (A))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vfpclassph. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) + _mm_mask_fpclass_ph_mask (__mmask8 __U, __m128h __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_ph_mask (__m128h __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) __A, + __imm, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_ph_mask (__mmask16 __U, __m256h __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A, + __imm, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_ph_mask (__m256h __A, const int __imm) +{ + return (__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) __A, + __imm, + (__mmask16) -1); +} + +#else +#define _mm_fpclass_ph_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \ + (int) (C),(__mmask8)-1)) + +#define _mm_mask_fpclass_ph_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclassph128_mask ((__v8hf) (__m128h) (X), \ + (int) (C),(__mmask8)(u))) + +#define _mm256_fpclass_ph_mask(X, C) \ + ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \ + (int) (C),(__mmask16)-1)) + +#define _mm256_mask_fpclass_ph_mask(u, X, C) \ + ((__mmask16) __builtin_ia32_fpclassph256_mask ((__v16hf) (__m256h) (X), \ + (int) (C),(__mmask16)(u))) +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vgetexpph, vgetexpsh. */ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getexp_ph (__m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getexp_ph (__m256h __W, __mmask16 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) __W, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_ph (__mmask16 __U, __m256h __A) +{ + return (__m256h) __builtin_ia32_getexpph256_mask ((__v16hf) __A, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ph (__m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ph (__m128h __W, __mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ph (__mmask8 __U, __m128h __A) +{ + return (__m128h) __builtin_ia32_getexpph128_mask ((__v8hf) __A, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U); +} + + +/* Intrinsics vgetmantph, vgetmantsh. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_ph (__m256h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_ph (__m256h __W, __mmask16 __U, __m256h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) __W, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_ph (__mmask16 __U, __m256h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256h) __builtin_ia32_getmantph256_mask ((__v16hf) __A, + (__C << 2) | __B, + (__v16hf) + _mm256_setzero_ph (), + (__mmask16) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ph (__m128h __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ph (__m128h __W, __mmask8 __U, __m128h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) __W, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ph (__mmask8 __U, __m128h __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128h) __builtin_ia32_getmantph128_mask ((__v8hf) __A, + (__C << 2) | __B, + (__v8hf) + _mm_setzero_ph (), + (__mmask8) __U); +} + +#else +#define _mm256_getmant_ph(X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)_mm256_setzero_ph (), \ + (__mmask16)-1)) + +#define _mm256_mask_getmant_ph(W, U, X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_getmant_ph(U, X, B, C) \ + ((__m256h) __builtin_ia32_getmantph256_mask ((__v16hf)(__m256h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v16hf)(__m256h)_mm256_setzero_ph (), \ + (__mmask16)(U))) + +#define _mm_getmant_ph(X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)_mm_setzero_ph (), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ph(W, U, X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ph(U, X, B, C) \ + ((__m128h) __builtin_ia32_getmantph128_mask ((__v8hf)(__m128h) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8hf)(__m128h)_mm_setzero_ph (), \ + (__mmask8)(U))) + +#endif /* __OPTIMIZE__ */ + +/* Intrinsics vcvtph2dq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__C, ( __v4si) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2dq128_mask (__B, + (__v4si) _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__C, ( __v8si) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2dq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2udq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__C, ( __v4si) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2udq128_mask (__B, + (__v4si) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__C, ( __v8si) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2udq256_mask (__B, + (__v8si) _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2dq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2dq128_mask (__A, + (__v4si) _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i)__builtin_ia32_vcvttph2dq128_mask (__C, + ( __v4si) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2dq128_mask (__B, + (__v4si) _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__C, + ( __v8si) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2dq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2udq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu32 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu32 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__C, + ( __v4si) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2udq128_mask (__B, + (__v4si) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu32 (__m128h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__A, + (__v8si) + _mm256_setzero_si256 (), (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu32 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__C, + ( __v8si) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu32 (__mmask8 __A, __m128h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2udq256_mask (__B, + (__v8si) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtdq2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ph (__m128i __A) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtdq2ph128_mask ((__v4si) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_ph (__m256i __A) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtdq2ph256_mask ((__v8si) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtudq2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_ph (__m128i __A) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __C, + __A, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu32_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtudq2ph128_mask ((__v4si) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_ph (__m256i __A) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu32_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtudq2ph256_mask ((__v8si) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2qq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi64 (__m128h __A) +{ + return + __builtin_ia32_vcvtph2qq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq128_mask (__C, __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2qq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2qq256_mask (__C, __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2qq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2uqq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__C, __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__C, __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2uqq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2qq. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq128_mask (__C, + __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2qq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2qq256_mask (__C, + __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2qq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2uqq. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__A, + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu64 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__C, + __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq128_mask (__B, + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu64 (__m128h __A) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__A, + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu64 (__m256i __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__C, + __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu64 (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvttph2uqq256_mask (__B, + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtqq2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_ph (__m128i __A) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtqq2ph128_mask ((__v2di) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_ph (__m256i __A) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtqq2ph256_mask ((__v4di) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtuqq2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu64_ph (__m128i __A) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu64_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtuqq2ph128_mask ((__v2di) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu64_ph (__m256i __A) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu64_ph (__m128h __A, __mmask8 __B, __m256i __C) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu64_ph (__mmask8 __A, __m256i __B) +{ + return __builtin_ia32_vcvtuqq2ph256_mask ((__v4di) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2w. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epi16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epi16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__C, ( __v8hi) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epi16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2w128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epi16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epi16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__C, ( __v16hi) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epi16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2w256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtph2uw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_epu16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_epu16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__C, ( __v8hi) __A, __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_epu16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvtph2uw128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_epu16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_epu16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__C, ( __v16hi) __A, __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_epu16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvtph2uw256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2w. 
*/ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epi16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epi16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__C, + ( __v8hi) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epi16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2w128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epi16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epi16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__C, + ( __v16hi) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epi16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2w256_mask (__B, + (__v16hi) + _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvttph2uw. */ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttph_epu16 (__m128h __A) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttph_epu16 (__m128i __A, __mmask8 __B, __m128h __C) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__C, + ( __v8hi) __A, + __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttph_epu16 (__mmask8 __A, __m128h __B) +{ + return (__m128i) + __builtin_ia32_vcvttph2uw128_mask (__B, + (__v8hi) + _mm_setzero_si128 (), + __A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttph_epu16 (__m256h __A) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttph_epu16 (__m256i __A, __mmask16 __B, __m256h __C) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__C, + ( __v16hi) __A, + __B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttph_epu16 (__mmask16 __A, __m256h __B) +{ + return (__m256i) + __builtin_ia32_vcvttph2uw256_mask (__B, + (__v16hi) _mm256_setzero_si256 (), + __A); +} + +/* Intrinsics vcvtw2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_ph (__m128i __A) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __C, + __A, + __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtw2ph128_mask ((__v8hi) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_ph (__m256i __A) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __A, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_ph (__m256h __A, __mmask16 __B, __m256i __C) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __C, + __A, + __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_ph (__mmask16 __A, __m256i __B) +{ + return __builtin_ia32_vcvtw2ph256_mask ((__v16hi) __B, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vcvtuw2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_ph (__m128i __A) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu16_ph (__m128h __A, __mmask8 __B, __m128i __C) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_ph (__mmask8 __A, __m128i __B) +{ + return __builtin_ia32_vcvtuw2ph128_mask ((__v8hi) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu16_ph (__m256i __A) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __A, + _mm256_setzero_ph (), + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu16_ph (__m256h __A, __mmask16 __B, __m256i __C) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __C, __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu16_ph (__mmask16 __A, __m256i __B) +{ + return __builtin_ia32_vcvtuw2ph256_mask ((__v16hi) __B, + _mm256_setzero_ph (), + __A); +} + +/* Intrinsics vcvtph2pd. 
*/ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd128_mask (__A, + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_pd (__m128d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd128_mask (__C, __A, __B); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd128_mask (__B, _mm_setzero_pd (), __A); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_pd (__m128h __A) +{ + return __builtin_ia32_vcvtph2pd256_mask (__A, + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_pd (__m256d __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2pd256_mask (__C, __A, __B); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_pd (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2pd256_mask (__B, + _mm256_setzero_pd (), + __A); +} + +/* Intrinsics vcvtph2ps. */ +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtxph_ps (__m128h __A) +{ + return __builtin_ia32_vcvtph2psx128_mask (__A, + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtxph_ps (__m128 __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2psx128_mask (__C, __A, __B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtxph_ps (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2psx128_mask (__B, _mm_setzero_ps (), __A); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtxph_ps (__m128h __A) +{ + return __builtin_ia32_vcvtph2psx256_mask (__A, + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtxph_ps (__m256 __A, __mmask8 __B, __m128h __C) +{ + return __builtin_ia32_vcvtph2psx256_mask (__C, __A, __B); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtxph_ps (__mmask8 __A, __m128h __B) +{ + return __builtin_ia32_vcvtph2psx256_mask (__B, + _mm256_setzero_ps (), + __A); +} + +/* Intrinsics vcvtxps2ph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtxps_ph (__m128 __A) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m128 __C) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtxps_ph (__mmask8 __A, __m128 __B) +{ + return __builtin_ia32_vcvtps2phx128_mask ((__v4sf) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtxps_ph (__m256 __A) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtxps_ph (__m128h __A, __mmask8 __B, __m256 __C) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtxps_ph (__mmask8 __A, __m256 __B) +{ + return __builtin_ia32_vcvtps2phx256_mask ((__v8sf) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vcvtpd2ph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_ph (__m128d __A) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m128d __C) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_ph (__mmask8 __A, __m128d __B) +{ + return __builtin_ia32_vcvtpd2ph128_mask ((__v2df) __B, + _mm_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_ph (__m256d __A) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __A, + _mm_setzero_ph (), + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_ph (__m128h __A, __mmask8 __B, __m256d __C) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __C, __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_ph (__mmask8 __A, __m256d __B) +{ + return __builtin_ia32_vcvtpd2ph256_mask ((__v4df) __B, + _mm_setzero_ph (), + __A); +} + +/* Intrinsics vfmaddsub[132,213,231]ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmaddsub_ph (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h)__builtin_ia32_vfmaddsubph256_mask ((__v16hf)__A, + (__v16hf)__B, + (__v16hf)__C, + (__mmask16)-1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmaddsub_ph (__m256h __A, __mmask16 __U, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddsubph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmaddsub_ph (__m256h __A, __m256h __B, __m256h __C, + __mmask16 __U) +{ + return (__m256h) __builtin_ia32_vfmaddsubph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmaddsub_ph (__mmask16 __U, __m256h __A, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddsubph256_maskz ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmaddsub_ph (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h)__builtin_ia32_vfmaddsubph128_mask ((__v8hf)__A, + (__v8hf)__B, + (__v8hf)__C, + (__mmask8)-1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmaddsub_ph (__m128h __A, __mmask8 __U, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddsubph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmaddsub_ph (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfmaddsubph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmaddsub_ph (__mmask8 __U, __m128h __A, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddsubph128_maskz ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +/* Intrinsics vfmsubadd[132,213,231]ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmsubadd_ph (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmsubaddph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsubadd_ph (__m256h __A, __mmask16 __U, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmsubaddph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsubadd_ph (__m256h __A, __m256h __B, __m256h __C, + __mmask16 __U) +{ + return (__m256h) __builtin_ia32_vfmsubaddph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsubadd_ph (__mmask16 __U, __m256h __A, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmsubaddph256_maskz ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsubadd_ph (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmsubaddph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsubadd_ph (__m128h __A, __mmask8 __U, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmsubaddph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsubadd_ph (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfmsubaddph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsubadd_ph (__mmask8 __U, __m128h __A, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmsubaddph128_maskz ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +/* Intrinsics vfmadd[132,213,231]ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmadd_ph (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_ph (__m256h __A, __mmask16 __U, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_ph (__m256h __A, __m256h __B, __m256h __C, + __mmask16 __U) +{ + return (__m256h) __builtin_ia32_vfmaddph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_ph (__mmask16 __U, __m256h __A, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddph256_maskz ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_ph (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_ph (__m128h __A, __mmask8 __U, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_ph (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfmaddph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_ph (__mmask8 __U, __m128h __A, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddph128_maskz ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +/* Intrinsics vfnmadd[132,213,231]ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fnmadd_ph (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfnmaddph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmadd_ph (__m256h __A, __mmask16 __U, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfnmaddph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmadd_ph (__m256h __A, __m256h __B, __m256h __C, + __mmask16 __U) +{ + return (__m256h) __builtin_ia32_vfnmaddph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmadd_ph (__mmask16 __U, __m256h __A, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfnmaddph256_maskz ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_ph (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfnmaddph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_ph (__m128h __A, __mmask8 __U, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfnmaddph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_ph (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfnmaddph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_ph (__mmask8 __U, __m128h __A, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfnmaddph128_maskz ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +/* Intrinsics vfmsub[132,213,231]ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmsub_ph (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmsubph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsub_ph (__m256h __A, __mmask16 __U, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmsubph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsub_ph (__m256h __A, __m256h __B, __m256h __C, + __mmask16 __U) +{ + return (__m256h) __builtin_ia32_vfmsubph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsub_ph (__mmask16 __U, __m256h __A, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmsubph256_maskz ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_ph (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmsubph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_ph (__m128h __A, __mmask8 __U, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmsubph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_ph (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfmsubph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_ph (__mmask8 __U, __m128h __A, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmsubph128_maskz ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +/* Intrinsics vfnmsub[132,213,231]ph. 
*/ +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fnmsub_ph (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfnmsubph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) -1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmsub_ph (__m256h __A, __mmask16 __U, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfnmsubph256_mask ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmsub_ph (__m256h __A, __m256h __B, __m256h __C, + __mmask16 __U) +{ + return (__m256h) __builtin_ia32_vfnmsubph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmsub_ph (__mmask16 __U, __m256h __A, __m256h __B, + __m256h __C) +{ + return (__m256h) __builtin_ia32_vfnmsubph256_maskz ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, + (__mmask16) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_ph (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfnmsubph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) -1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_ph (__m128h __A, __mmask8 __U, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfnmsubph128_mask ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_ph (__m128h __A, __m128h __B, __m128h __C, + __mmask8 __U) +{ + return (__m128h) __builtin_ia32_vfnmsubph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_ph (__mmask8 __U, __m128h __A, __m128h __B, + __m128h __C) +{ + return (__m128h) __builtin_ia32_vfnmsubph128_maskz ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, + (__mmask8) + __U); +} + +/* Intrinsics vf[,c]maddcph. 
*/ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_pch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmaddcph128 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcph128_mask ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) + __builtin_ia32_vfmaddcph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfmaddcph128_maskz ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmadd_pch (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmaddcph256 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) + __builtin_ia32_vfmaddcph256_mask ((__v16hf) __A, + (__v16hf) __C, + (__v16hf) __D, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) +{ + return (__m256h) + __builtin_ia32_vfmaddcph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D) +{ + return (__m256h)__builtin_ia32_vfmaddcph256_maskz ((__v16hf) __B, + (__v16hf) __C, + (__v16hf) __D, __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfcmaddcph128 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmadd_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcph128_mask ((__v8hf) __A, + (__v8hf) __C, + (__v8hf) __D, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fcmadd_pch (__m128h __A, __m128h __B, __m128h __C, __mmask8 __D) +{ + return (__m128h) + __builtin_ia32_vfcmaddcph128_mask3 ((__v8hf) __A, + (__v8hf) __B, + (__v8hf) __C, __D); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmadd_pch (__mmask8 __A, __m128h __B, __m128h __C, __m128h __D) +{ + return (__m128h)__builtin_ia32_vfcmaddcph128_maskz ((__v8hf) __B, + (__v8hf) __C, + (__v8hf) __D, __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256 
((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fcmadd_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) + __builtin_ia32_vfcmaddcph256_mask ((__v16hf) __A, + (__v16hf) __C, + (__v16hf) __D, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fcmadd_pch (__m256h __A, __m256h __B, __m256h __C, __mmask8 __D) +{ + return (__m256h) + __builtin_ia32_vfcmaddcph256_mask3 ((__v16hf) __A, + (__v16hf) __B, + (__v16hf) __C, __D); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fcmadd_pch (__mmask8 __A, __m256h __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfcmaddcph256_maskz ((__v16hf) __B, + (__v16hf) __C, + (__v16hf) __D, __A); +} + +/* Intrinsics vf[,c]mulcph. */ +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmul_pch (__m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfmulcph128 ((__v8hf) __A, (__v8hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmul_pch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfmulcph128_mask ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmul_pch (__m256h __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_vfmulcph256 ((__v16hf) __A, + (__v16hf) __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __C, + (__v16hf) __D, + (__v16hf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmul_pch (__mmask8 __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfmulcph256_mask ((__v16hf) __B, + (__v16hf) __C, + _mm256_setzero_ph (), + __A); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fcmul_pch (__m128h __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_vfcmulcph128 ((__v8hf) __A, + (__v8hf) __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fcmul_pch (__m128h __A, __mmask8 __B, __m128h __C, __m128h __D) +{ + return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __C, + (__v8hf) __D, + (__v8hf) __A, __B); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fcmul_pch (__mmask8 __A, __m128h __B, __m128h __C) +{ + return (__m128h) __builtin_ia32_vfcmulcph128_mask ((__v8hf) __B, + (__v8hf) __C, + _mm_setzero_ph (), + __A); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fcmul_pch (__m256h __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_vfcmulcph256 ((__v16hf) __A, (__v16hf) __B); +} + +extern __inline 
__m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fcmul_pch (__m256h __A, __mmask8 __B, __m256h __C, __m256h __D) +{ + return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __C, + (__v16hf) __D, + (__v16hf) __A, __B); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fcmul_pch (__mmask8 __A, __m256h __B, __m256h __C) +{ + return (__m256h) __builtin_ia32_vfcmulcph256_mask ((__v16hf) __B, + (__v16hf) __C, + _mm256_setzero_ph (), + __A); +} + +#define _MM256_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \ + __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \ + __m128h __T3 = (__T1 op __T2); \ + __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T5 = (__T3) op (__T4); \ + __m128h __T6 = (__m128h) __builtin_shuffle (__T5, \ + (__v8hi) { 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T7 = __T5 op __T6; \ + return __T7[0] op __T7[1] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_add_ph (__m256h __A) +{ + _MM256_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_mul_ph (__m256h __A) +{ + _MM256_REDUCE_OP (*); +} + +#undef _MM256_REDUCE_OP +#define _MM256_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 0); \ + __m128h __T2 = (__m128h) _mm256_extractf128_pd ((__m256d) __A, 1); \ + __m128h __T3 = _mm_##op (__T1, __T2); \ + __m128h __T4 = (__m128h) __builtin_shuffle (__T3, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T5 = _mm_##op (__T3, __T4); \ + __m128h __T6 = (__m128h) __builtin_shuffle (__T5, (__v8hi) { 4, 5 }); \ + __m128h __T7 = _mm_##op (__T5, __T6); \ + __m128h __T8 = (__m128h) __builtin_shuffle (__T7, (__v8hi) { 1, 0 }); \ + __m128h __T9 = _mm_##op (__T7, __T8); \ + return __T9[0] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_min_ph (__m256h __A) +{ + _MM256_REDUCE_OP (min_ph); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_max_ph (__m256h __A) +{ + _MM256_REDUCE_OP (max_ph); +} + +#define _MM_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) __builtin_shuffle (__A, \ + (__v8hi) { 4, 5, 6, 7, 0, 1, 2, 3 }); \ + __m128h __T2 = (__A) op (__T1); \ + __m128h __T3 = (__m128h) __builtin_shuffle (__T2, \ + (__v8hi){ 2, 3, 0, 1, 4, 5, 6, 7 }); \ + __m128h __T4 = __T2 op __T3; \ + return __T4[0] op __T4[1] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_add_ph (__m128h __A) +{ + _MM_REDUCE_OP (+); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_mul_ph (__m128h __A) +{ + _MM_REDUCE_OP (*); +} + +#undef _MM_REDUCE_OP +#define _MM_REDUCE_OP(op) \ + __m128h __T1 = (__m128h) __builtin_shuffle (__A, \ + (__v8hi) { 2, 3, 0, 1, 6, 7, 4, 5 }); \ + __m128h __T2 = _mm_##op (__A, __T1); \ + __m128h __T3 = (__m128h) __builtin_shuffle (__T2, (__v8hi){ 4, 5 }); \ + __m128h __T4 = _mm_##op (__T2, __T3); \ + __m128h __T5 = (__m128h) __builtin_shuffle (__T4, (__v8hi){ 1, 0 }); \ + __m128h __T6 = _mm_##op (__T4, __T5); \ + return __T6[0] + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_min_ph (__m128h __A) +{ + 
_MM_REDUCE_OP (min_ph); +} + +extern __inline _Float16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_max_ph (__m128h __A) +{ + _MM_REDUCE_OP (max_ph); +} + +#undef _MM256_REDUCE_OP +#undef _MM_REDUCE_OP + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_ph (__mmask16 __U, __m256h __A, __m256h __W) +{ + return (__m256h) __builtin_ia32_movdquhi256_mask ((__v16hi) __W, + (__v16hi) __A, + (__mmask16) __U); + +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_ph (__m256h __A, __m256i __I, __m256h __B) +{ + return (__m256h) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A, + (__v16hi) __I, + (__v16hi) __B, + (__mmask16)-1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_ph (__m256i __A, __m256h __B) +{ + return (__m256h) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, + (__v16hi) __A, + (__v16hi) + (_mm256_setzero_ph ()), + (__mmask16)-1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_ph (__mmask8 __U, __m128h __A, __m128h __W) +{ + return (__m128h) __builtin_ia32_movdquhi128_mask ((__v8hi) __W, + (__v8hi) __A, + (__mmask8) __U); + +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_ph (__m128h __A, __m128i __I, __m128h __B) +{ + return (__m128h) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A, + (__v8hi) __I, + (__v8hi) __B, + (__mmask8)-1); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutexvar_ph (__m128i __A, __m128h __B) +{ + return (__m128h) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, + (__v8hi) __A, + (__v8hi) + (_mm_setzero_ph ()), + (__mmask8)-1); +} + +extern __inline __m256h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_pch (_Float16 _Complex __A) +{ + union + { + _Float16 _Complex __a; + float __b; + } __u = { .__a = __A }; + + return (__m256h) _mm256_set1_ps (__u.__b); +} + +extern __inline __m128h +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pch (_Float16 _Complex __A) +{ + union + { + _Float16 _Complex __a; + float __b; + } __u = { .__a = __A }; + + return (__m128h) _mm_set1_ps (__u.__b); +} + +// intrinsics below are alias for f*mul_*ch +#define _mm_mul_pch(A, B) _mm_fmul_pch ((A), (B)) +#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch ((W), (U), (A), (B)) +#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch ((U), (A), (B)) +#define _mm256_mul_pch(A, B) _mm256_fmul_pch ((A), (B)) +#define _mm256_mask_mul_pch(W, U, A, B) \ + _mm256_mask_fmul_pch ((W), (U), (A), (B)) +#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch ((U), (A), (B)) + +#define _mm_cmul_pch(A, B) _mm_fcmul_pch ((A), (B)) +#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch ((W), (U), (A), (B)) +#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch ((U), (A), (B)) +#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch ((A), (B)) +#define _mm256_mask_cmul_pch(W, U, A, B) \ + _mm256_mask_fcmul_pch ((W), (U), (A), (B)) +#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch((U), (A), (B)) + +#ifdef __DISABLE_AVX512FP16VL__ +#undef __DISABLE_AVX512FP16VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512FP16VL__ */ + +#endif /* __AVX512FP16VLINTRIN_H_INCLUDED */ diff --git 
a/include-gcc/avx512ifmaintrin.h b/include-gcc/avx512ifmaintrin.h new file mode 100644 index 0000000..fc97f1d --- /dev/null +++ b/include-gcc/avx512ifmaintrin.h @@ -0,0 +1,104 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512IFMAINTRIN_H_INCLUDED +#define _AVX512IFMAINTRIN_H_INCLUDED + +#ifndef __AVX512IFMA__ +#pragma GCC push_options +#pragma GCC target("avx512ifma") +#define __DISABLE_AVX512IFMA__ +#endif /* __AVX512IFMA__ */ + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmadd52luq512_mask ((__v8di) __W, + (__v8di) __X, + (__v8di) __Y, + (__mmask8) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmadd52huq512_mask ((__v8di) __W, + (__v8di) __X, + (__v8di) __Y, + (__mmask8) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52luq512_maskz ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i) __builtin_ia32_vpmadd52huq512_maskz ((__v8di) __X, + (__v8di) __Y, + (__v8di) __Z, + (__mmask8) __M); +} + +#ifdef __DISABLE_AVX512IFMA__ +#undef __DISABLE_AVX512IFMA__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512IFMA__ */ + +#endif /* _AVX512IFMAINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512ifmavlintrin.h b/include-gcc/avx512ifmavlintrin.h new file mode 100644 index 0000000..cac55fe --- /dev/null 
+++ b/include-gcc/avx512ifmavlintrin.h @@ -0,0 +1,145 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512IFMAVLINTRIN_H_INCLUDED +#define _AVX512IFMAVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512IFMA__) +#pragma GCC push_options +#pragma GCC target("avx512ifma,avx512vl") +#define __DISABLE_AVX512IFMAVL__ +#endif /* __AVX512IFMAVL__ */ + +#define _mm_madd52lo_epu64(A, B, C) \ + ((__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) (A), \ + (__v2di) (B), \ + (__v2di) (C))) + +#define _mm_madd52hi_epu64(A, B, C) \ + ((__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) (A), \ + (__v2di) (B), \ + (__v2di) (C))) + +#define _mm256_madd52lo_epu64(A, B, C) \ + ((__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) (A), \ + (__v4di) (B), \ + (__v4di) (C))) + + +#define _mm256_madd52hi_epu64(A, B, C) \ + ((__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) (A), \ + (__v4di) (B), \ + (__v4di) (C))) + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmadd52luq128_mask ((__v2di) __W, + (__v2di) __X, + (__v2di) __Y, + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmadd52huq128_mask ((__v2di) __W, + (__v2di) __X, + (__v2di) __Y, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmadd52luq256_mask ((__v4di) __W, + (__v4di) __X, + (__v4di) __Y, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmadd52huq256_mask ((__v4di) __W, + (__v4di) __X, + (__v4di) __Y, + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52luq128_maskz ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z, + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52huq128_maskz ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52luq256_maskz ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52huq256_maskz ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z, + (__mmask8) __M); +} + +#ifdef __DISABLE_AVX512IFMAVL__ +#undef __DISABLE_AVX512IFMAVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512IFMAVL__ */ + +#endif /* _AVX512IFMAVLINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512pfintrin.h b/include-gcc/avx512pfintrin.h new file mode 100644 index 0000000..a547610 --- /dev/null +++ b/include-gcc/avx512pfintrin.h @@ -0,0 +1,269 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512PFINTRIN_H_INCLUDED +#define _AVX512PFINTRIN_H_INCLUDED + +#ifndef __AVX512PF__ +#pragma GCC push_options +#pragma GCC target("avx512pf") +#define __DISABLE_AVX512PF__ +#endif /* __AVX512PF__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef long long __v8di __attribute__ ((__vector_size__ (64))); +typedef int __v16si __attribute__ ((__vector_size__ (64))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. 
*/ +typedef long long __m512i __attribute__ ((__vector_size__ (64), __may_alias__)); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +#ifdef __OPTIMIZE__ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32gather_pd (__m256i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32gather_ps (__m512i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32gather_pd (__m256i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfdpd (__mask, (__v8si) __index, __addr, __scale, + __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32gather_ps (__m512i __index, __mmask16 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfdps (__mask, (__v16si) __index, __addr, __scale, + __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64gather_pd (__m512i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfqpd ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64gather_ps (__m512i __index, void const *__addr, + int __scale, int __hint) +{ + __builtin_ia32_gatherpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64gather_pd (__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfqpd (__mask, (__v8di) __index, __addr, __scale, + __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64gather_ps (__m512i __index, __mmask8 __mask, + void const *__addr, int __scale, int __hint) +{ + __builtin_ia32_gatherpfqps (__mask, (__v8di) __index, __addr, __scale, + __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32scatter_pd (void *__addr, __m256i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfdpd ((__mmask8) 0xFF, (__v8si) __index, __addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i32scatter_ps (void *__addr, __m512i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfdps ((__mmask16) 0xFFFF, (__v16si) __index, __addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfdpd (__mask, (__v8si) __index, __addr, __scale, + __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i32scatter_ps (void *__addr, __mmask16 
__mask, + __m512i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfdps (__mask, (__v16si) __index, __addr, __scale, + __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64scatter_pd (void *__addr, __m512i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfqpd ((__mmask8) 0xFF, (__v8di) __index,__addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_prefetch_i64scatter_ps (void *__addr, __m512i __index, int __scale, + int __hint) +{ + __builtin_ia32_scatterpfqps ((__mmask8) 0xFF, (__v8di) __index, __addr, + __scale, __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64scatter_pd (void *__addr, __mmask8 __mask, + __m512i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfqpd (__mask, (__v8di) __index, __addr, __scale, + __hint); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_prefetch_i64scatter_ps (void *__addr, __mmask8 __mask, + __m512i __index, int __scale, int __hint) +{ + __builtin_ia32_scatterpfqps (__mask, (__v8di) __index, __addr, __scale, + __hint); +} + +#else +#define _mm512_prefetch_i32gather_pd(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX), \ + (void const *) (ADDR), (int) (SCALE), \ + (int) (HINT)) + +#define _mm512_prefetch_i32gather_ps(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX), \ + (void const *) (ADDR), (int) (SCALE), \ + (int) (HINT)) + +#define _mm512_mask_prefetch_i32gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), \ + (void const *) (ADDR), (int) (SCALE), \ + (int) (HINT)) + +#define _mm512_mask_prefetch_i32gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfdps ((__mmask16) (MASK), (__v16si)(__m512i) (INDEX),\ + (void const *) (ADDR), (int) (SCALE), \ + (int) (HINT)) + +#define _mm512_prefetch_i64gather_pd(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_prefetch_i64gather_ps(INDEX, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_mask_prefetch_i64gather_pd(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_mask_prefetch_i64gather_ps(INDEX, MASK, ADDR, SCALE, HINT) \ + __builtin_ia32_gatherpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_prefetch_i32scatter_pd(ADDR, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfdpd ((__mmask8)0xFF, (__v8si)(__m256i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_prefetch_i32scatter_ps(ADDR, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfdps ((__mmask16)0xFFFF, (__v16si)(__m512i) (INDEX),\ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_mask_prefetch_i32scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfdpd ((__mmask8) (MASK), (__v8si)(__m256i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define 
_mm512_mask_prefetch_i32scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfdps ((__mmask16) (MASK), \ + (__v16si)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_prefetch_i64scatter_pd(ADDR, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfqpd ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_prefetch_i64scatter_ps(ADDR, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfqps ((__mmask8)0xFF, (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_mask_prefetch_i64scatter_pd(ADDR, MASK, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfqpd ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) + +#define _mm512_mask_prefetch_i64scatter_ps(ADDR, MASK, INDEX, SCALE, HINT) \ + __builtin_ia32_scatterpfqps ((__mmask8) (MASK), (__v8di)(__m512i) (INDEX), \ + (void *) (ADDR), (int) (SCALE), (int) (HINT)) +#endif + +#ifdef __DISABLE_AVX512PF__ +#undef __DISABLE_AVX512PF__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512PF__ */ + +#endif /* _AVX512PFINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vbmi2intrin.h b/include-gcc/avx512vbmi2intrin.h new file mode 100644 index 0000000..528d193 --- /dev/null +++ b/include-gcc/avx512vbmi2intrin.h @@ -0,0 +1,557 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
+#endif + +#ifndef __AVX512VBMI2INTRIN_H_INCLUDED +#define __AVX512VBMI2INTRIN_H_INCLUDED + +#if !defined(__AVX512VBMI2__) +#pragma GCC push_options +#pragma GCC target("avx512vbmi2") +#define __DISABLE_AVX512VBMI2__ +#endif /* __AVX512VBMI2__ */ + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi16 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)__A, (__v32hi) __B, + __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi32 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)__A, (__v16si) __B, + __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__C, + (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v16si_mask ((__v16si)__B, + (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdi_epi64 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)__A, (__v8di) __B, __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__C, (__v8di) __D, + __E, (__v8di) __A, (__mmask8)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v8di_mask ((__v8di)__B, (__v8di) __C, + __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi16 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)__A, (__v32hi) __B, + __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi32 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v16si ((__v16si)__A, (__v16si) __B, + __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__C, + (__v16si) __D, __E, (__v16si) __A, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi32 (__mmask16 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v16si_mask ((__v16si)__B, + (__v16si) __C, __D, (__v16si) _mm512_setzero_si512 (), (__mmask16)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldi_epi64 (__m512i __A, __m512i __B, int __C) +{ + return (__m512i) __builtin_ia32_vpshld_v8di 
((__v8di)__A, (__v8di) __B, __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__C, (__v8di) __D, + __E, (__v8di) __A, (__mmask8)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi64 (__mmask8 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v8di_mask ((__v8di)__B, (__v8di) __C, + __D, (__v8di) _mm512_setzero_si512 (), (__mmask8)__A); +} +#else +#define _mm512_shrdi_epi16(A, B, C) \ + ((__m512i) __builtin_ia32_vpshrd_v32hi ((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B),(int)(C))) +#define _mm512_shrdi_epi32(A, B, C) \ + ((__m512i) __builtin_ia32_vpshrd_v16si ((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B),(int)(C))) +#define _mm512_mask_shrdi_epi32(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(C), \ + (__v16si)(__m512i)(D), \ + (int)(E), \ + (__v16si)(__m512i)(A), \ + (__mmask16)(B))) +#define _mm512_maskz_shrdi_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vpshrd_v16si_mask ((__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C),(int)(D), \ + (__v16si)(__m512i)_mm512_setzero_si512 (), \ + (__mmask16)(A))) +#define _mm512_shrdi_epi64(A, B, C) \ + ((__m512i) __builtin_ia32_vpshrd_v8di ((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B),(int)(C))) +#define _mm512_mask_shrdi_epi64(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(C), \ + (__v8di)(__m512i)(D), (int)(E), \ + (__v8di)(__m512i)(A), \ + (__mmask8)(B))) +#define _mm512_maskz_shrdi_epi64(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vpshrd_v8di_mask ((__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C),(int)(D), \ + (__v8di)(__m512i)_mm512_setzero_si512 (), \ + (__mmask8)(A))) +#define _mm512_shldi_epi16(A, B, C) \ + ((__m512i) __builtin_ia32_vpshld_v32hi ((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B),(int)(C))) +#define _mm512_shldi_epi32(A, B, C) \ + ((__m512i) __builtin_ia32_vpshld_v16si ((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B),(int)(C))) +#define _mm512_mask_shldi_epi32(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(C), \ + (__v16si)(__m512i)(D), \ + (int)(E), \ + (__v16si)(__m512i)(A), \ + (__mmask16)(B))) +#define _mm512_maskz_shldi_epi32(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vpshld_v16si_mask ((__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C),(int)(D), \ + (__v16si)(__m512i)_mm512_setzero_si512 (), \ + (__mmask16)(A))) +#define _mm512_shldi_epi64(A, B, C) \ + ((__m512i) __builtin_ia32_vpshld_v8di ((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(C))) +#define _mm512_mask_shldi_epi64(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(C), \ + (__v8di)(__m512i)(D), (int)(E), \ + (__v8di)(__m512i)(A), \ + (__mmask8)(B))) +#define _mm512_maskz_shldi_epi64(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vpshld_v8di_mask ((__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C),(int)(D), \ + (__v8di)(__m512i)_mm512_setzero_si512 (), \ + (__mmask8)(A))) +#endif + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdv_epi16 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshrdv_v32hi ((__v32hi)__A, (__v32hi) __B, + (__v32hi) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_shrdv_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshrdv_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shrdv_epi64 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshrdv_v8di ((__v8di)__A, (__v8di) __B, + (__v8di) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v8di_mask ((__v8di)__A, (__v8di) __C, + (__v8di) __D, (__mmask8)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v8di_maskz ((__v8di)__B, (__v8di) __C, + (__v8di) __D, (__mmask8)__A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldv_epi16 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshldv_v32hi ((__v32hi)__A, (__v32hi) __B, + (__v32hi) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldv_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshldv_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldv_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldv_epi32 (__mmask16 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_shldv_epi64 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpshldv_v8di ((__v8di)__A, (__v8di) __B, + (__v8di) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldv_epi64 (__m512i __A, __mmask8 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v8di_mask ((__v8di)__A, (__v8di) __C, + (__v8di) __D, (__mmask8)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldv_epi64 (__mmask8 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v8di_maskz ((__v8di)__B, (__v8di) __C, + (__v8di) 
__D, (__mmask8)__A); +} + +#ifdef __DISABLE_AVX512VBMI2__ +#undef __DISABLE_AVX512VBMI2__ + +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VBMI2__ */ + +#if !defined(__AVX512VBMI2__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512vbmi2,avx512bw") +#define __DISABLE_AVX512VBMI2BW__ +#endif /* __AVX512VBMI2BW__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi8 (__m512i __A, __mmask64 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__C, + (__v64qi)__A, (__mmask64)__B); +} + + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi8 (__mmask64 __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi)__B, + (__v64qi)_mm512_setzero_si512 (), (__mmask64)__A); +} + + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi8 (void * __A, __mmask64 __B, __m512i __C) +{ + __builtin_ia32_compressstoreuqi512_mask ((__v64qi *) __A, (__v64qi) __C, + (__mmask64) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compress_epi16 (__m512i __A, __mmask32 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__C, + (__v32hi)__A, (__mmask32)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_compress_epi16 (__mmask32 __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi)__B, + (__v32hi)_mm512_setzero_si512 (), (__mmask32)__A); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_compressstoreu_epi16 (void * __A, __mmask32 __B, __m512i __C) +{ + __builtin_ia32_compressstoreuhi512_mask ((__v32hi *) __A, (__v32hi) __C, + (__mmask32) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi8 (__m512i __A, __mmask64 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __C, + (__v64qi) __A, + (__mmask64) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi8 (__mmask64 __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_expandqi512_maskz ((__v64qi) __B, + (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi8 (__m512i __A, __mmask64 __B, const void * __C) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *) __C, + (__v64qi) __A, (__mmask64) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi8 (__mmask64 __A, const void * __B) +{ + return (__m512i) __builtin_ia32_expandloadqi512_maskz ((const __v64qi *) __B, + (__v64qi) _mm512_setzero_si512 (), (__mmask64) __A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expand_epi16 (__m512i __A, __mmask32 __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __C, + (__v32hi) __A, + (__mmask32) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expand_epi16 (__mmask32 __A, __m512i 
__B) +{ + return (__m512i) __builtin_ia32_expandhi512_maskz ((__v32hi) __B, + (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_expandloadu_epi16 (__m512i __A, __mmask32 __B, const void * __C) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *) __C, + (__v32hi) __A, (__mmask32) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_expandloadu_epi16 (__mmask32 __A, const void * __B) +{ + return (__m512i) __builtin_ia32_expandloadhi512_maskz ((const __v32hi *) __B, + (__v32hi) _mm512_setzero_si512 (), (__mmask32) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__C, + (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshrd_v32hi_mask ((__v32hi)__B, + (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shldi_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D, + int __E) +{ + return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__C, + (__v32hi) __D, __E, (__v32hi) __A, (__mmask32)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldi_epi16 (__mmask32 __A, __m512i __B, __m512i __C, int __D) +{ + return (__m512i)__builtin_ia32_vpshld_v32hi_mask ((__v32hi)__B, + (__v32hi) __C, __D, (__v32hi) _mm512_setzero_si512 (), (__mmask32)__A); +} + +#else +#define _mm512_mask_shrdi_epi16(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(C), \ + (__v32hi)(__m512i)(D), \ + (int)(E), \ + (__v32hi)(__m512i)(A), \ + (__mmask32)(B))) +#define _mm512_maskz_shrdi_epi16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vpshrd_v32hi_mask ((__v32hi)(__m512i)(B), \ + (__v32hi)(__m512i)(C),(int)(D), \ + (__v32hi)(__m512i)_mm512_setzero_si512 (), \ + (__mmask32)(A))) +#define _mm512_mask_shldi_epi16(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(C), \ + (__v32hi)(__m512i)(D), \ + (int)(E), \ + (__v32hi)(__m512i)(A), \ + (__mmask32)(B))) +#define _mm512_maskz_shldi_epi16(A, B, C, D) \ + ((__m512i) \ + __builtin_ia32_vpshld_v32hi_mask ((__v32hi)(__m512i)(B), \ + (__v32hi)(__m512i)(C),(int)(D), \ + (__v32hi)(__m512i)_mm512_setzero_si512 (), \ + (__mmask32)(A))) +#endif + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_shrdv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v32hi_mask ((__v32hi)__A, + (__v32hi) __C, (__v32hi) __D, (__mmask32)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shrdv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshrdv_v32hi_maskz ((__v32hi)__B, + (__v32hi) __C, (__v32hi) __D, (__mmask32)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm512_mask_shldv_epi16 (__m512i __A, __mmask32 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v32hi_mask ((__v32hi)__A, + (__v32hi) __C, (__v32hi) __D, (__mmask32)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_shldv_epi16 (__mmask32 __A, __m512i __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpshldv_v32hi_maskz ((__v32hi)__B, + (__v32hi) __C, (__v32hi) __D, (__mmask32)__A); +} + +#ifdef __DISABLE_AVX512VBMI2BW__ +#undef __DISABLE_AVX512VBMI2BW__ + +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VBMI2BW__ */ + +#endif /* __AVX512VBMI2INTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vbmi2vlintrin.h b/include-gcc/avx512vbmi2vlintrin.h new file mode 100644 index 0000000..86efca2 --- /dev/null +++ b/include-gcc/avx512vbmi2vlintrin.h @@ -0,0 +1,1037 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
+#endif + +#ifndef _AVX512VBMI2VLINTRIN_H_INCLUDED +#define _AVX512VBMI2VLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) +#pragma GCC push_options +#pragma GCC target("avx512vbmi2,avx512vl") +#define __DISABLE_AVX512VBMI2VL__ +#endif /* __AVX512VBMIVL__ */ + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi8 (__m128i __A, __mmask16 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi)__C, + (__v16qi)__A, (__mmask16)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi8 (__mmask16 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __B, + (__v16qi) _mm_setzero_si128 (), (__mmask16) __A); +} + + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi16 (void * __A, __mmask16 __B, __m256i __C) +{ + __builtin_ia32_compressstoreuhi256_mask ((__v16hi *) __A, (__v16hi) __C, + (__mmask16) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi16 (__m128i __A, __mmask8 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi)__C, (__v8hi)__A, + (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi16 (__mmask8 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __B, + (__v8hi) _mm_setzero_si128 (), (__mmask8) __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi16 (__m256i __A, __mmask16 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi)__C, + (__v16hi)__A, (__mmask16)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi16 (__mmask16 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __B, + (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi8 (void * __A, __mmask16 __B, __m128i __C) +{ + __builtin_ia32_compressstoreuqi128_mask ((__v16qi *) __A, (__v16qi) __C, + (__mmask16) __B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi16 (void * __A, __mmask8 __B, __m128i __C) +{ + __builtin_ia32_compressstoreuhi128_mask ((__v8hi *) __A, (__v8hi) __C, + (__mmask8) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi8 (__m128i __A, __mmask16 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __C, + (__v16qi) __A, + (__mmask16) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi8 (__mmask16 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_expandqi128_maskz ((__v16qi) __B, + (__v16qi) _mm_setzero_si128 (), (__mmask16) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi8 (__m128i __A, __mmask16 __B, const void * __C) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *) __C, + (__v16qi) __A, (__mmask16) __B); +} + 
+extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi8 (__mmask16 __A, const void * __B) +{ + return (__m128i) __builtin_ia32_expandloadqi128_maskz ((const __v16qi *) __B, + (__v16qi) _mm_setzero_si128 (), (__mmask16) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi16 (__m128i __A, __mmask8 __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __C, + (__v8hi) __A, + (__mmask8) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi16 (__mmask8 __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_expandhi128_maskz ((__v8hi) __B, + (__v8hi) _mm_setzero_si128 (), (__mmask8) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi16 (__m128i __A, __mmask8 __B, const void * __C) +{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *) __C, + (__v8hi) __A, (__mmask8) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi16 (__mmask8 __A, const void * __B) +{ + return (__m128i) __builtin_ia32_expandloadhi128_maskz ((const __v8hi *) __B, + (__v8hi) _mm_setzero_si128 (), (__mmask8) __A); +} +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi16 (__m256i __A, __mmask16 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __C, + (__v16hi) __A, + (__mmask16) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi16 (__mmask16 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_expandhi256_maskz ((__v16hi) __B, + (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi16 (__m256i __A, __mmask16 __B, const void * __C) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *) __C, + (__v16hi) __A, (__mmask16) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi16 (__mmask16 __A, const void * __B) +{ + return (__m256i) __builtin_ia32_expandloadhi256_maskz ((const __v16hi *) __B, + (__v16hi) _mm256_setzero_si256 (), (__mmask16) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi16 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)__A, (__v16hi) __B, + __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__C, + (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v16hi_mask ((__v16hi)__B, + (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_shrdi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__C, (__v8si) __D, + __E, (__v8si) __A, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v8si_mask ((__v8si)__B, (__v8si) __C, + __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi32 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)__A, (__v8si) __B, __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__C, (__v4di) __D, + __E, (__v4di) __A, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshrd_v4di_mask ((__v4di)__B, (__v4di) __C, + __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdi_epi64 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)__A, (__v4di) __B, __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__C, (__v8hi) __D, + __E, (__v8hi) __A, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v8hi_mask ((__v8hi)__B, (__v8hi) __C, + __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi16 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)__A, (__v8hi) __B, __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__C, (__v4si) __D, + __E, (__v4si) __A, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v4si_mask ((__v4si)__B, (__v4si) __C, + __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi32 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)__A, (__v4si) __B, __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__C, 
(__v2di) __D, + __E, (__v2di) __A, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshrd_v2di_mask ((__v2di)__B, (__v2di) __C, + __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdi_epi64 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)__A, (__v2di) __B, __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi16 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)__A, (__v16hi) __B, + __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__C, + (__v16hi) __D, __E, (__v16hi) __A, (__mmask16)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi16 (__mmask16 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v16hi_mask ((__v16hi)__B, + (__v16hi) __C, __D, (__v16hi) _mm256_setzero_si256 (), (__mmask16)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__C, (__v8si) __D, + __E, (__v8si) __A, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi32 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v8si_mask ((__v8si)__B, (__v8si) __C, + __D, (__v8si) _mm256_setzero_si256 (), (__mmask8)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi32 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v8si ((__v8si)__A, (__v8si) __B, __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldi_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D, + int __E) +{ + return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__C, (__v4di) __D, + __E, (__v4di) __A, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldi_epi64 (__mmask8 __A, __m256i __B, __m256i __C, int __D) +{ + return (__m256i)__builtin_ia32_vpshld_v4di_mask ((__v4di)__B, (__v4di) __C, + __D, (__v4di) _mm256_setzero_si256 (), (__mmask8)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldi_epi64 (__m256i __A, __m256i __B, int __C) +{ + return (__m256i) __builtin_ia32_vpshld_v4di ((__v4di)__A, (__v4di) __B, __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__C, (__v8hi) __D, + __E, (__v8hi) __A, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_maskz_shldi_epi16 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v8hi_mask ((__v8hi)__B, (__v8hi) __C, + __D, (__v8hi) _mm_setzero_si128 (), (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi16 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)__A, (__v8hi) __B, __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__C, (__v4si) __D, + __E, (__v4si) __A, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi32 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v4si_mask ((__v4si)__B, (__v4si) __C, + __D, (__v4si) _mm_setzero_si128 (), (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi32 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v4si ((__v4si)__A, (__v4si) __B, __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldi_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D, + int __E) +{ + return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__C, (__v2di) __D, + __E, (__v2di) __A, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldi_epi64 (__mmask8 __A, __m128i __B, __m128i __C, int __D) +{ + return (__m128i)__builtin_ia32_vpshld_v2di_mask ((__v2di)__B, (__v2di) __C, + __D, (__v2di) _mm_setzero_si128 (), (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldi_epi64 (__m128i __A, __m128i __B, int __C) +{ + return (__m128i) __builtin_ia32_vpshld_v2di ((__v2di)__A, (__v2di) __B, __C); +} +#else +#define _mm256_shrdi_epi16(A, B, C) \ + ((__m256i) __builtin_ia32_vpshrd_v16hi ((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B),(int)(C))) +#define _mm256_mask_shrdi_epi16(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(C), \ + (__v16hi)(__m256i)(D), \ + (int)(E), \ + (__v16hi)(__m256i)(A), \ + (__mmask16)(B))) +#define _mm256_maskz_shrdi_epi16(A, B, C, D) \ + ((__m256i) \ + __builtin_ia32_vpshrd_v16hi_mask ((__v16hi)(__m256i)(B), \ + (__v16hi)(__m256i)(C),(int)(D), \ + (__v16hi)(__m256i)_mm256_setzero_si256 (), \ + (__mmask16)(A))) +#define _mm256_shrdi_epi32(A, B, C) \ + ((__m256i) __builtin_ia32_vpshrd_v8si ((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B),(int)(C))) +#define _mm256_mask_shrdi_epi32(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(C), \ + (__v8si)(__m256i)(D), \ + (int)(E), \ + (__v8si)(__m256i)(A), \ + (__mmask8)(B))) +#define _mm256_maskz_shrdi_epi32(A, B, C, D) \ + ((__m256i) \ + __builtin_ia32_vpshrd_v8si_mask ((__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C),(int)(D), \ + (__v8si)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(A))) +#define _mm256_shrdi_epi64(A, B, C) \ + ((__m256i) __builtin_ia32_vpshrd_v4di ((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B),(int)(C))) +#define _mm256_mask_shrdi_epi64(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vpshrd_v4di_mask 
((__v4di)(__m256i)(C), \ + (__v4di)(__m256i)(D), (int)(E), \ + (__v4di)(__m256i)(A), \ + (__mmask8)(B))) +#define _mm256_maskz_shrdi_epi64(A, B, C, D) \ + ((__m256i) \ + __builtin_ia32_vpshrd_v4di_mask ((__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C),(int)(D), \ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(A))) +#define _mm_shrdi_epi16(A, B, C) \ + ((__m128i) __builtin_ia32_vpshrd_v8hi ((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B),(int)(C))) +#define _mm_mask_shrdi_epi16(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(C), \ + (__v8hi)(__m128i)(D), (int)(E), \ + (__v8hi)(__m128i)(A), \ + (__mmask8)(B))) +#define _mm_maskz_shrdi_epi16(A, B, C, D) \ + ((__m128i) \ + __builtin_ia32_vpshrd_v8hi_mask ((__v8hi)(__m128i)(B), \ + (__v8hi)(__m128i)(C),(int)(D), \ + (__v8hi)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(A))) +#define _mm_shrdi_epi32(A, B, C) \ + ((__m128i) __builtin_ia32_vpshrd_v4si ((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B),(int)(C))) +#define _mm_mask_shrdi_epi32(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(C), \ + (__v4si)(__m128i)(D), (int)(E), \ + (__v4si)(__m128i)(A), \ + (__mmask8)(B))) +#define _mm_maskz_shrdi_epi32(A, B, C, D) \ + ((__m128i) \ + __builtin_ia32_vpshrd_v4si_mask ((__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C),(int)(D), \ + (__v4si)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(A))) +#define _mm_shrdi_epi64(A, B, C) \ + ((__m128i) __builtin_ia32_vpshrd_v2di ((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B),(int)(C))) +#define _mm_mask_shrdi_epi64(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(C), \ + (__v2di)(__m128i)(D), (int)(E), \ + (__v2di)(__m128i)(A), \ + (__mmask8)(B))) +#define _mm_maskz_shrdi_epi64(A, B, C, D) \ + ((__m128i) \ + __builtin_ia32_vpshrd_v2di_mask ((__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C),(int)(D), \ + (__v2di)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(A))) +#define _mm256_shldi_epi16(A, B, C) \ + ((__m256i) __builtin_ia32_vpshld_v16hi ((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B),(int)(C))) +#define _mm256_mask_shldi_epi16(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(C), \ + (__v16hi)(__m256i)(D), \ + (int)(E), \ + (__v16hi)(__m256i)(A), \ + (__mmask16)(B))) +#define _mm256_maskz_shldi_epi16(A, B, C, D) \ + ((__m256i) \ + __builtin_ia32_vpshld_v16hi_mask ((__v16hi)(__m256i)(B), \ + (__v16hi)(__m256i)(C),(int)(D), \ + (__v16hi)(__m256i)_mm256_setzero_si256 (), \ + (__mmask16)(A))) +#define _mm256_shldi_epi32(A, B, C) \ + ((__m256i) __builtin_ia32_vpshld_v8si ((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B),(int)(C))) +#define _mm256_mask_shldi_epi32(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(C), \ + (__v8si)(__m256i)(D), (int)(E), \ + (__v8si)(__m256i)(A), \ + (__mmask8)(B))) +#define _mm256_maskz_shldi_epi32(A, B, C, D) \ + ((__m256i) \ + __builtin_ia32_vpshld_v8si_mask ((__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C),(int)(D), \ + (__v8si)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(A))) +#define _mm256_shldi_epi64(A, B, C) \ + ((__m256i) __builtin_ia32_vpshld_v4di ((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B),(int)(C))) +#define _mm256_mask_shldi_epi64(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vpshld_v4di_mask ((__v4di)(__m256i)(C), \ + (__v4di)(__m256i)(D), (int)(E), \ + (__v4di)(__m256i)(A), \ + (__mmask8)(B))) +#define _mm256_maskz_shldi_epi64(A, B, C, D) \ + ((__m256i) \ + __builtin_ia32_vpshld_v4di_mask 
((__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C),(int)(D), \ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(A))) +#define _mm_shldi_epi16(A, B, C) \ + ((__m128i) __builtin_ia32_vpshld_v8hi ((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B),(int)(C))) +#define _mm_mask_shldi_epi16(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(C), \ + (__v8hi)(__m128i)(D), (int)(E), \ + (__v8hi)(__m128i)(A), \ + (__mmask8)(B))) +#define _mm_maskz_shldi_epi16(A, B, C, D) \ + ((__m128i) \ + __builtin_ia32_vpshld_v8hi_mask ((__v8hi)(__m128i)(B), \ + (__v8hi)(__m128i)(C),(int)(D), \ + (__v8hi)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(A))) +#define _mm_shldi_epi32(A, B, C) \ + ((__m128i) __builtin_ia32_vpshld_v4si ((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B),(int)(C))) +#define _mm_mask_shldi_epi32(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(C), \ + (__v4si)(__m128i)(D), (int)(E), \ + (__v4si)(__m128i)(A), \ + (__mmask8)(B))) +#define _mm_maskz_shldi_epi32(A, B, C, D) \ + ((__m128i) \ + __builtin_ia32_vpshld_v4si_mask ((__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C),(int)(D), \ + (__v4si)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(A))) +#define _mm_shldi_epi64(A, B, C) \ + ((__m128i) __builtin_ia32_vpshld_v2di ((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B),(int)(C))) +#define _mm_mask_shldi_epi64(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(C), \ + (__v2di)(__m128i)(D), (int)(E), \ + (__v2di)(__m128i)(A), \ + (__mmask8)(B))) +#define _mm_maskz_shldi_epi64(A, B, C, D) \ + ((__m128i) \ + __builtin_ia32_vpshld_v2di_mask ((__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C),(int)(D), \ + (__v2di)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(A))) +#endif + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdv_epi16 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshrdv_v16hi ((__v16hi)__A, (__v16hi) __B, + (__v16hi) __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v16hi_mask ((__v16hi)__A, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v16hi_maskz ((__v16hi)__B, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shrdv_epi32 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshrdv_v8si ((__v8si)__A, (__v8si) __B, + (__v8si) __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v8si_maskz ((__v8si)__B, (__v8si) __C, + (__v8si) __D, (__mmask8)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_shrdv_epi64 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshrdv_v4di ((__v4di)__A, (__v4di) __B, + (__v4di) __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shrdv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v4di_mask ((__v4di)__A, (__v4di) __C, + (__v4di) __D, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shrdv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshrdv_v4di_maskz ((__v4di)__B, (__v4di) __C, + (__v4di) __D, (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdv_epi16 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshrdv_v8hi ((__v8hi)__A, (__v8hi) __B, + (__v8hi) __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v8hi_mask ((__v8hi)__A, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdv_epi32 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshrdv_v4si ((__v4si)__A, (__v4si) __B, + (__v4si) __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v4si_mask ((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v4si_maskz ((__v4si)__B, (__v4si) __C, + (__v4si) __D, (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shrdv_epi64 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshrdv_v2di ((__v2di)__A, (__v2di) __B, + (__v2di) __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shrdv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v2di_mask ((__v2di)__A, (__v2di) __C, + (__v2di) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shrdv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshrdv_v2di_maskz ((__v2di)__B, (__v2di) __C, + (__v2di) __D, (__mmask8)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldv_epi16 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshldv_v16hi ((__v16hi)__A, (__v16hi) __B, + (__v16hi) __C); +} + +extern __inline __m256i 
+__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldv_epi16 (__m256i __A, __mmask16 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v16hi_mask ((__v16hi)__A, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldv_epi16 (__mmask16 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v16hi_maskz ((__v16hi)__B, + (__v16hi) __C, (__v16hi) __D, (__mmask16)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldv_epi32 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshldv_v8si ((__v8si)__A, (__v8si) __B, + (__v8si) __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldv_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B) ; +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldv_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v8si_maskz ((__v8si)__B, (__v8si) __C, + (__v8si) __D, (__mmask8)__A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shldv_epi64 (__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpshldv_v4di ((__v4di)__A, (__v4di) __B, + (__v4di) __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shldv_epi64 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v4di_mask ((__v4di)__A, (__v4di) __C, + (__v4di) __D, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shldv_epi64 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpshldv_v4di_maskz ((__v4di)__B, (__v4di) __C, + (__v4di) __D, (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldv_epi16 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshldv_v8hi ((__v8hi)__A, (__v8hi) __B, + (__v8hi) __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldv_epi16 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v8hi_mask ((__v8hi)__A, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldv_epi16 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v8hi_maskz ((__v8hi)__B, (__v8hi) __C, + (__v8hi) __D, (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldv_epi32 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshldv_v4si ((__v4si)__A, (__v4si) __B, + (__v4si) __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldv_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v4si_mask 
((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldv_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v4si_maskz ((__v4si)__B, (__v4si) __C, + (__v4si) __D, (__mmask8)__A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shldv_epi64 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpshldv_v2di ((__v2di)__A, (__v2di) __B, + (__v2di) __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shldv_epi64 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v2di_mask ((__v2di)__A, (__v2di) __C, + (__v2di) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shldv_epi64 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpshldv_v2di_maskz ((__v2di)__B, (__v2di) __C, + (__v2di) __D, (__mmask8)__A); +} + + + + +#ifdef __DISABLE_AVX512VBMI2VL__ +#undef __DISABLE_AVX512VBMI2VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VBMIVL__ */ + +#if !defined(__AVX512VL__) || !defined(__AVX512VBMI2__) || \ + !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512vbmi2,avx512vl,avx512bw") +#define __DISABLE_AVX512VBMI2VLBW__ +#endif /* __AVX512VBMIVLBW__ */ + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi8 (__m256i __A, __mmask32 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi)__C, + (__v32qi)__A, (__mmask32)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi8 (__mmask32 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __B, + (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi8 (void * __A, __mmask32 __B, __m256i __C) +{ + __builtin_ia32_compressstoreuqi256_mask ((__v32qi *) __A, (__v32qi) __C, + (__mmask32) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi8 (__m256i __A, __mmask32 __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __C, + (__v32qi) __A, + (__mmask32) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi8 (__mmask32 __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_expandqi256_maskz ((__v32qi) __B, + (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi8 (__m256i __A, __mmask32 __B, const void * __C) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *) __C, + (__v32qi) __A, (__mmask32) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi8 (__mmask32 __A, const void * __B) +{ + return (__m256i) __builtin_ia32_expandloadqi256_maskz ((const __v32qi *) __B, + (__v32qi) _mm256_setzero_si256 (), (__mmask32) __A); +} + +#ifdef 
__DISABLE_AVX512VBMI2VLBW__ +#undef __DISABLE_AVX512VBMI2VLBW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VBMIVLBW__ */ + +#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vbmiintrin.h b/include-gcc/avx512vbmiintrin.h new file mode 100644 index 0000000..5025860 --- /dev/null +++ b/include-gcc/avx512vbmiintrin.h @@ -0,0 +1,158 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vbmiintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512VBMIINTRIN_H_INCLUDED +#define _AVX512VBMIINTRIN_H_INCLUDED + +#ifndef __AVX512VBMI__ +#pragma GCC push_options +#pragma GCC target("avx512vbmi") +#define __DISABLE_AVX512VBMI__ +#endif /* __AVX512VBMI__ */ + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_multishift_epi64_epi8 (__m512i __W, __mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_multishift_epi64_epi8 (__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_multishift_epi64_epi8 (__m512i __X, __m512i __Y) +{ + return (__m512i) __builtin_ia32_vpmultishiftqb512_mask ((__v64qi) __X, + (__v64qi) __Y, + (__v64qi) + _mm512_undefined_epi32 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, + (__v64qi) __A, + (__v64qi) + _mm512_undefined_epi32 (), + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, + (__v64qi) __A, + (__v64qi) + _mm512_setzero_si512(), + (__mmask64) __M); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i) __builtin_ia32_permvarqi512_mask ((__v64qi) __B, + (__v64qi) __A, + (__v64qi) __W, + (__mmask64) __M); +} + +extern __inline __m512i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_permutex2var_epi8 (__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I + /* idx */ , + (__v64qi) __A, + (__v64qi) __B, + (__mmask64) -1); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_permutex2var_epi8 (__m512i __A, __mmask64 __U, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varqi512_mask ((__v64qi) __I + /* idx */ , + (__v64qi) __A, + (__v64qi) __B, + (__mmask64) + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask2_permutex2var_epi8 (__m512i __A, __m512i __I, + __mmask64 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermi2varqi512_mask ((__v64qi) __A, + (__v64qi) __I + /* idx */ , + (__v64qi) __B, + (__mmask64) + __U); +} + +extern __inline __m512i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_permutex2var_epi8 (__mmask64 __U, __m512i __A, + __m512i __I, __m512i __B) +{ + return (__m512i) __builtin_ia32_vpermt2varqi512_maskz ((__v64qi) __I + /* idx */ , + (__v64qi) __A, + (__v64qi) __B, + (__mmask64) + __U); +} + +#ifdef __DISABLE_AVX512VBMI__ +#undef __DISABLE_AVX512VBMI__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VBMI__ */ + +#endif /* _AVX512VBMIINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vbmivlintrin.h b/include-gcc/avx512vbmivlintrin.h new file mode 100644 index 0000000..035408f --- /dev/null +++ b/include-gcc/avx512vbmivlintrin.h @@ -0,0 +1,273 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vbmivlintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _AVX512VBMIVLINTRIN_H_INCLUDED +#define _AVX512VBMIVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512VBMI__) +#pragma GCC push_options +#pragma GCC target("avx512vbmi,avx512vl") +#define __DISABLE_AVX512VBMIVL__ +#endif /* __AVX512VBMIVL__ */ + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_multishift_epi64_epi8 (__m256i __W, __mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v32qi) __W, + (__mmask32) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_multishift_epi64_epi8 (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_multishift_epi64_epi8 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_vpmultishiftqb256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v32qi) + _mm256_undefined_si256 (), + (__mmask32) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_multishift_epi64_epi8 (__m128i __W, __mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v16qi) __W, + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_multishift_epi64_epi8 (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_multishift_epi64_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_vpmultishiftqb128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, + (__v32qi) __A, + (__v32qi) + _mm256_undefined_si256 (), + (__mmask32) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, + (__v32qi) __A, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarqi256_mask ((__v32qi) __B, + (__v32qi) __A, + (__v32qi) __W, + (__mmask32) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutexvar_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, + (__v16qi) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask16) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i 
__A, __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, + (__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarqi128_mask ((__v16qi) __B, + (__v16qi) __A, + (__v16qi) __W, + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi8 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I + /* idx */ , + (__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi8 (__m256i __A, __mmask32 __U, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varqi256_mask ((__v32qi) __I + /* idx */ , + (__v32qi) __A, + (__v32qi) __B, + (__mmask32) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi8 (__m256i __A, __m256i __I, + __mmask32 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2varqi256_mask ((__v32qi) __A, + (__v32qi) __I + /* idx */ , + (__v32qi) __B, + (__mmask32) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi8 (__mmask32 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varqi256_maskz ((__v32qi) __I + /* idx */ , + (__v32qi) __A, + (__v32qi) __B, + (__mmask32) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi8 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I + /* idx */ , + (__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi8 (__m128i __A, __mmask16 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varqi128_mask ((__v16qi) __I + /* idx */ , + (__v16qi) __A, + (__v16qi) __B, + (__mmask16) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi8 (__m128i __A, __m128i __I, __mmask16 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2varqi128_mask ((__v16qi) __A, + (__v16qi) __I + /* idx */ , + (__v16qi) __B, + (__mmask16) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi8 (__mmask16 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varqi128_maskz ((__v16qi) __I + /* idx */ , + (__v16qi) __A, + (__v16qi) __B, + (__mmask16) + __U); +} + +#ifdef __DISABLE_AVX512VBMIVL__ +#undef __DISABLE_AVX512VBMIVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VBMIVL__ */ + +#endif /* _AVX512VBMIVLINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vlbwintrin.h b/include-gcc/avx512vlbwintrin.h new file mode 100644 index 0000000..0232783 --- /dev/null +++ b/include-gcc/avx512vlbwintrin.h @@ -0,0 +1,4758 @@ +/* Copyright (C) 2014-2023 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vlbwintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512VLBWINTRIN_H_INCLUDED +#define _AVX512VLBWINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("avx512vl,avx512bw") +#define __DISABLE_AVX512VLBW__ +#endif /* __AVX512VLBW__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef short __v16hi_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef short __v8hi_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef char __v32qi_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef char __v16qi_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdquqi256_mask ((__v32qi) __A, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdquqi128_mask ((__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi8 (void *__P, __m256i __A) +{ + *(__v32qi_u *) __P = (__v32qi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) +{ + __builtin_ia32_storedquqi256_mask ((char *) __P, + (__v32qi) __A, + (__mmask32) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi8 (void *__P, __m128i __A) +{ + *(__v16qi_u *) __P = (__v16qi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) +{ +
__builtin_ia32_storedquqi128_mask ((char *) __P, + (__v16qi) __A, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi16 (void const *__P) +{ + return (__m256i) (*(__v16hi_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquhi256_mask ((const short *) __P, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi16 (void const *__P) +{ + return (__m128i) (*(__v8hi_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquhi128_mask ((const short *) __P, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdquhi256_mask ((__v16hi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdquhi128_mask ((__v8hi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi8 (void const *__P) +{ + return (__m256i) (*(__v32qi_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquqi256_mask ((const char *) __P, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi8 (void const 
*__P) +{ + return (__m128i) (*(__v16qi_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquqi128_mask ((const char *) __P, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi16_epi8 (__m256i __A) +{ + + return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A) +{ + __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovwb256_mask ((__v16hi) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi16_epi8 (__m128i __A) +{ + + return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A) +{ + __builtin_ia32_pmovswb128mem_mask ((unsigned long long *) __P , (__v8hi) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi16_epi8 (__m256i __A) +{ + + return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A) +{ + __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi16_epi8 (__m128i __A) +{ + + return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A) +{ + __builtin_ia32_pmovuswb128mem_mask ((unsigned long long *) __P , (__v8hi) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi16_epi8 (__m256i __A) +{ + + return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M,__m256i __A) +{ + __builtin_ia32_pmovuswb256mem_mask ((__v16qi *) __P , (__v16hi) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A, + (__v32qi) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastb256_mask ((__v16qi) __A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) +{ + return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, + (__v32qi) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_set1_epi8 (__mmask32 __M, char __A) +{ + return (__m256i) __builtin_ia32_pbroadcastb256_gpr_mask (__A, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastb128_mask 
((__v16qi) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastb128_mask ((__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) +{ + return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_set1_epi8 (__mmask16 __M, char __A) +{ + return (__m128i) __builtin_ia32_pbroadcastb128_gpr_mask (__A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A, + (__v16hi) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastw256_mask ((__v8hi) __A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) +{ + return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, + (__v16hi) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_set1_epi16 (__mmask16 __M, short __A) +{ + return (__m256i) __builtin_ia32_pbroadcastw256_gpr_mask (__A, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A, + (__v8hi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastw128_mask ((__v8hi) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) +{ + return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, + (__v8hi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_set1_epi16 (__mmask8 __M, short __A) +{ + return (__m128i) __builtin_ia32_pbroadcastw128_gpr_mask (__A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, + (__v16hi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, + (__v16hi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __M); +} + 
+extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_permvarhi256_mask ((__v16hi) __B, + (__v16hi) __A, + (__v16hi) __W, + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutexvar_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, + (__v8hi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, + (__v8hi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_permvarhi128_mask ((__v8hi) __B, + (__v8hi) __A, + (__v8hi) __W, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi16 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I + /* idx */ , + (__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi16 (__m256i __A, __mmask16 __U, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varhi256_mask ((__v16hi) __I + /* idx */ , + (__v16hi) __A, + (__v16hi) __B, + (__mmask16) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi16 (__m256i __A, __m256i __I, + __mmask16 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2varhi256_mask ((__v16hi) __A, + (__v16hi) __I + /* idx */ , + (__v16hi) __B, + (__mmask16) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varhi256_maskz ((__v16hi) __I + /* idx */ , + (__v16hi) __A, + (__v16hi) __B, + (__mmask16) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi16 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I + /* idx */ , + (__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi16 (__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varhi128_mask ((__v8hi) __I + /* idx */ , + (__v8hi) __A, + (__v8hi) __B, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi16 (__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2varhi128_mask ((__v8hi) __A, + (__v8hi) __I + /* idx */ , + (__v8hi) __B, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varhi128_maskz ((__v8hi) __I + /* idx */ , + (__v8hi) __A, + (__v8hi) __B, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_maddubs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_maddubs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmaddubsw256_mask ((__v32qi) __X, + (__v32qi) __Y, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_maddubs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_maddubs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaddubsw128_mask ((__v16qi) __X, + (__v16qi) __Y, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_madd_epi16 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_madd_epi16 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaddwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_madd_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_madd_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaddwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi8_mask (__m128i __A) +{ + return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movepi8_mask (__m256i __A) +{ + return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi16_mask (__m128i __A) +{ + return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movepi16_mask (__m256i __A) +{ + return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_movm_epi8 (__mmask16 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2b128 (__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movm_epi8 (__mmask32 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2b256 (__A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movm_epi16 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2w128 (__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movm_epi16 (__mmask16 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2w256 (__A); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ptestmb128 ((__v16qi) __A, + (__v16qi) __B, __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ptestmb256 ((__v32qi) __A, + (__v32qi) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmw128 ((__v8hi) __A, + (__v8hi) __B, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ptestmw256 ((__v16hi) __A, + (__v16hi) __B, __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu16 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu16 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu16 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuw128_mask 
((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu16 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi16 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi16 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu8 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxub256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu8 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epu8 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxub128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi8 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi8 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi8 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi8 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, 
__m256i __B) +{ + return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu8 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminub256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu8 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu8 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminub128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi8 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi8 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epi8 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi8 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi16 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi16 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi16 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi16 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_maskz_max_epu16 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu16 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu16 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epu16 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epi16 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi16 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __M); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_alignr_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B, const int __N) +{ + return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A, + (__v4di) __B, + __N * 8, + (__v4di) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_alignr_epi8 (__mmask32 __U, __m256i __A, __m256i __B, + const int __N) +{ + return (__m256i) __builtin_ia32_palignr256_mask ((__v4di) __A, + (__v4di) __B, + __N * 8, + (__v4di) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_alignr_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B, const int __N) +{ + return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A, + (__v2di) __B, + __N * 8, + (__v2di) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_alignr_epi8 (__mmask16 __U, __m128i __A, __m128i __B, + const int __N) +{ + return (__m128i) __builtin_ia32_palignr128_mask ((__v2di) __A, + (__v2di) __B, + __N * 8, + (__v2di) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dbsad_epu8 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, + (__v32qi) __B, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dbsad_epu8 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B, const 
int __imm) +{ + return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, + (__v32qi) __B, + __imm, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dbsad_epu8 (__mmask16 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi) __A, + (__v32qi) __B, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dbsad_epu8 (__m128i __A, __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, + (__v16qi) __B, + __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dbsad_epu8 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, + (__v16qi) __B, + __imm, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dbsad_epu8 (__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) +{ + return (__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi) __A, + (__v16qi) __B, + __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) __A, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) __A, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) __A, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) __A, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi16_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi16_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi16_mask (__mmask16 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi16_mask (__m256i __X, __m256i __Y, const int __P) +{ + 
return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi8_mask (__mmask16 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi8_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi8_mask (__mmask32 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi8_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu16_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu16_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu16_mask (__mmask16 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu16_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu8_mask (__mmask16 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu8_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, __P, + (__mmask16) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu8_mask (__mmask32 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu8_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, __P, + (__mmask32) -1); +} + +extern __inline __m256i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi16 (__mmask16 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi) __A, __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi) __A, __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shufflehi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A, + __imm, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shufflehi_epi16 (__mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi) __A, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shufflehi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shufflehi_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi) __A, __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shufflelo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A, + __imm, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shufflelo_epi16 (__mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi) __A, + __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shufflelo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shufflelo_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi) __A, __imm, + (__v8hi) + 
_mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srai_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srai_epi16 (__mmask16 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrawi256_mask ((__v16hi) __A, __imm, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srai_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srai_epi16 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrawi128_mask ((__v8hi) __A, __imm, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_slli_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + int __B) +{ + return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_slli_epi16 (__mmask16 __U, __m256i __A, int __B) +{ + return (__m256i) __builtin_ia32_psllwi256_mask ((__v16hi) __A, __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_slli_epi16 (__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllwi128_mask ((__v8hi) __A, __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +#else +#define _mm256_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)((N) * 8), \ + (__v4di)(__m256i)(X), (__mmask32)(U))) + +#define _mm256_mask_srli_epi16(W, U, A, B) \ + ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), \ + (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_srli_epi16(U, A, B) \ + ((__m256i) __builtin_ia32_psrlwi256_mask ((__v16hi)(__m256i)(A), \ + (int)(B), (__v16hi)_mm256_setzero_si256 (), (__mmask16)(U))) + +#define _mm_mask_srli_epi16(W, U, A, B) \ + ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), \ + (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srli_epi16(U, A, B) \ + ((__m128i) __builtin_ia32_psrlwi128_mask ((__v8hi)(__m128i)(A), \ + (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U))) + +#define _mm256_mask_srai_epi16(W, U, A, B) \ + ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), \ + (int)(B), (__v16hi)(__m256i)(W), (__mmask16)(U))) + +#define _mm256_maskz_srai_epi16(U, A, B) \ + ((__m256i) __builtin_ia32_psrawi256_mask ((__v16hi)(__m256i)(A), \ + (int)(B), 
(__v16hi)_mm256_setzero_si256 (), (__mmask16)(U))) + +#define _mm_mask_srai_epi16(W, U, A, B) \ + ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), \ + (int)(B), (__v8hi)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srai_epi16(U, A, B) \ + ((__m128i) __builtin_ia32_psrawi128_mask ((__v8hi)(__m128i)(A), \ + (int)(B), (__v8hi)_mm_setzero_si128(), (__mmask8)(U))) + +#define _mm256_mask_shufflehi_epi16(W, U, A, B) \ + ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)(__m256i)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_shufflehi_epi16(U, A, B) \ + ((__m256i) __builtin_ia32_pshufhw256_mask ((__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)(__m256i)_mm256_setzero_si256 (), \ + (__mmask16)(U))) + +#define _mm_mask_shufflehi_epi16(W, U, A, B) \ + ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_shufflehi_epi16(U, A, B) \ + ((__m128i) __builtin_ia32_pshufhw128_mask ((__v8hi)(__m128i)(A), (int)(B), \ + (__v8hi)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(U))) + +#define _mm256_mask_shufflelo_epi16(W, U, A, B) \ + ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)(__m256i)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_shufflelo_epi16(U, A, B) \ + ((__m256i) __builtin_ia32_pshuflw256_mask ((__v16hi)(__m256i)(A), (int)(B), \ + (__v16hi)(__m256i)_mm256_setzero_si256 (), \ + (__mmask16)(U))) + +#define _mm_mask_shufflelo_epi16(W, U, A, B) \ + ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_shufflelo_epi16(U, A, B) \ + ((__m128i) __builtin_ia32_pshuflw128_mask ((__v8hi)(__m128i)(A), (int)(B), \ + (__v8hi)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(U))) + +#define _mm256_maskz_alignr_epi8(U, X, Y, N) \ + ((__m256i) __builtin_ia32_palignr256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)((N) * 8), \ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask32)(U))) + +#define _mm_mask_alignr_epi8(W, U, X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)((N) * 8), \ + (__v2di)(__m128i)(X), (__mmask16)(U))) + +#define _mm_maskz_alignr_epi8(U, X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)((N) * 8), \ + (__v2di)(__m128i)_mm_setzero_si128 (), \ + (__mmask16)(U))) + +#define _mm_mask_slli_epi16(W, U, X, C) \ + ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\ + (__v8hi)(__m128i)(W),\ + (__mmask8)(U))) + +#define _mm_maskz_slli_epi16(U, X, C) \ + ((__m128i)__builtin_ia32_psllwi128_mask ((__v8hi)(__m128i)(X), (int)(C),\ + (__v8hi)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm256_dbsad_epu8(X, Y, C) \ + ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), \ + (__v32qi)(__m256i) (Y), (int) (C), \ + (__v16hi)(__m256i)_mm256_setzero_si256(),\ + (__mmask16)-1)) + +#define _mm256_mask_slli_epi16(W, U, X, C) \ + ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\ + (__v16hi)(__m256i)(W),\ + (__mmask16)(U))) + +#define _mm256_maskz_slli_epi16(U, X, C) \ + ((__m256i)__builtin_ia32_psllwi256_mask ((__v16hi)(__m256i)(X), (int)(C),\ + (__v16hi)(__m256i)_mm256_setzero_si256 (),\ + (__mmask16)(U))) + +#define _mm256_mask_dbsad_epu8(W, U, X, Y, C) \ + ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), \ + 
(__v32qi)(__m256i) (Y), (int) (C), \ + (__v16hi)(__m256i)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_dbsad_epu8(U, X, Y, C) \ + ((__m256i) __builtin_ia32_dbpsadbw256_mask ((__v32qi)(__m256i) (X), \ + (__v32qi)(__m256i) (Y), (int) (C), \ + (__v16hi)(__m256i)_mm256_setzero_si256(),\ + (__mmask16)(U))) + +#define _mm_dbsad_epu8(X, Y, C) \ + ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), \ + (__v16qi)(__m128i) (Y), (int) (C), \ + (__v8hi)(__m128i)_mm_setzero_si128(), \ + (__mmask8)-1)) + +#define _mm_mask_dbsad_epu8(W, U, X, Y, C) \ + ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), \ + (__v16qi)(__m128i) (Y), (int) (C), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_dbsad_epu8(U, X, Y, C) \ + ((__m128i) __builtin_ia32_dbpsadbw128_mask ((__v16qi)(__m128i) (X), \ + (__v16qi)(__m128i) (Y), (int) (C), \ + (__v8hi)(__m128i)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm_mask_blend_epi16(__U, __A, __W) \ + ((__m128i) __builtin_ia32_blendmw_128_mask ((__v8hi) (__A), \ + (__v8hi) (__W), \ + (__mmask8) (__U))) + +#define _mm_mask_blend_epi8(__U, __A, __W) \ + ((__m128i) __builtin_ia32_blendmb_128_mask ((__v16qi) (__A), \ + (__v16qi) (__W), \ + (__mmask16) (__U))) + +#define _mm256_mask_blend_epi16(__U, __A, __W) \ + ((__m256i) __builtin_ia32_blendmw_256_mask ((__v16hi) (__A), \ + (__v16hi) (__W), \ + (__mmask16) (__U))) + +#define _mm256_mask_blend_epi8(__U, __A, __W) \ + ((__m256i) __builtin_ia32_blendmb_256_mask ((__v32qi) (__A), \ + (__v32qi) (__W), \ + (__mmask32) (__U))) + +#define _mm_cmp_epi16_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), \ + (__v8hi)(__m128i)(Y), (int)(P),\ + (__mmask8)(-1))) + +#define _mm_cmp_epi8_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(P),\ + (__mmask16)(-1))) + +#define _mm256_cmp_epi16_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(P),\ + (__mmask16)(-1))) + +#define _mm256_cmp_epi8_mask(X, Y, P) \ + ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(P),\ + (__mmask32)(-1))) + +#define _mm_cmp_epu16_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), \ + (__v8hi)(__m128i)(Y), (int)(P),\ + (__mmask8)(-1))) + +#define _mm_cmp_epu8_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(P),\ + (__mmask16)(-1))) + +#define _mm256_cmp_epu16_mask(X, Y, P) \ + ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(P),\ + (__mmask16)(-1))) + +#define _mm256_cmp_epu8_mask(X, Y, P) \ + ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(P),\ + (__mmask32)-1)) + +#define _mm_mask_cmp_epi16_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi)(__m128i)(X), \ + (__v8hi)(__m128i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_mask_cmp_epi8_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(P),\ + (__mmask16)(M))) + +#define _mm256_mask_cmp_epi16_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(P),\ + (__mmask16)(M))) + +#define _mm256_mask_cmp_epi8_mask(M, X, Y, P) \ + ((__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), 
(int)(P),\ + (__mmask32)(M))) + +#define _mm_mask_cmp_epu16_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi)(__m128i)(X), \ + (__v8hi)(__m128i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_mask_cmp_epu8_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(P),\ + (__mmask16)(M))) + +#define _mm256_mask_cmp_epu16_mask(M, X, Y, P) \ + ((__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi)(__m256i)(X), \ + (__v16hi)(__m256i)(Y), (int)(P),\ + (__mmask16)(M))) + +#define _mm256_mask_cmp_epu8_mask(M, X, Y, P) \ + ((__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(P),\ + (__mmask32)(M))) +#endif + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) -1); +} + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) -1); +} + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) -1); +} + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 4, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 1, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epu8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + 
(__v16qi) __Y, 5, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 2, + (__mmask16) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epu16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 4, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 1, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 5, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi8_mask (__m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 2, + (__mmask16) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi16_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 2, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mulhrs_epi16 (__m256i __W, __mmask16 __U, __m256i __X, + __m256i __Y) +{ + return 
(__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X, + (__v16hi) __Y, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mulhrs_epi16 (__mmask16 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmulhrsw256_mask ((__v16hi) __X, + (__v16hi) __Y, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mulhi_epu16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mulhi_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulhuw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mulhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mulhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulhw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mulhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mulhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulhw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mulhi_epu16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mulhi_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulhuw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mulhrs_epi16 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X, + (__v8hi) __Y, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mulhrs_epi16 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmulhrsw128_mask ((__v8hi) __X, + (__v8hi) __Y, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_mullo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mullo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmullw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mullo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mullo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmullw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi8_epi16 (__m256i __W, __mmask16 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi8_epi16 (__mmask16 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbw256_mask ((__v16qi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi8_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbw128_mask ((__v16qi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu8_epi16 (__m256i __W, __mmask16 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbw256_mask ((__v16qi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu8_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu8_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbw128_mask ((__v16qi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_avg_epu8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgb256_mask 
((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_avg_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_avg_epu8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_avg_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_avg_epu16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_avg_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pavgw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_avg_epu16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_avg_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pavgw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + 
__m256i __B) +{ + return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epu8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_adds_epu16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_adds_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_subs_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_subs_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubsw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epu8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_subs_epu8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_subs_epu16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_subs_epu16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubusw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + 
_mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklbw256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_maskz_unpacklo_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklbw128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklwd256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklwd128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 0, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 0, + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqb128_mask ((__v16qi) __A, + (__v16qi) __B, + __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 0, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 0, + __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_pcmpeqb256_mask ((__v32qi) __A, + (__v32qi) __B, + __U); +} + 
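(Editorial aside, not part of the imported GCC header.) The definitions above all follow the same AVX-512 write-mask convention: the _mm*_mask_* variants take a pass-through operand __W whose lanes are kept wherever the corresponding mask bit is 0, while the _mm*_maskz_* variants zero those lanes, and the *_mask comparison intrinsics AND their result with the supplied mask. A minimal usage sketch of that convention, assuming a host GCC invoked with -mavx512bw -mavx512vl and that these intrinsics are reached through <immintrin.h>; the file name demo.c and the printed lane indices are illustrative only:

/* Illustrative sketch only -- compile with: gcc -mavx512bw -mavx512vl demo.c */
#include <immintrin.h>
#include <stdio.h>

int main (void)
{
  __m256i a = _mm256_set1_epi8 (1);
  __m256i b = _mm256_set1_epi8 (2);

  /* Byte-wise equality under a write mask: only mask bits set in the
     first argument can appear in the result, so the upper 16 bits
     of the returned mask stay 0.  */
  __mmask32 eq = _mm256_mask_cmpeq_epi8_mask (0x0000ffffu, a, a);

  /* mask_ variant: lanes with a 0 mask bit keep the pass-through
     operand (here a); the maskz_ variant zeroes them instead.  */
  __m256i sum  = _mm256_mask_add_epi8 (a, eq, a, b);
  __m256i sumz = _mm256_maskz_add_epi8 (eq, a, b);

  printf ("mask = %08x\n", (unsigned) eq);                        /* 0000ffff */
  printf ("sum  : lane0=%d lane31=%d\n",
          ((const char *) &sum)[0], ((const char *) &sum)[31]);   /* 3 1 */
  printf ("sumz : lane0=%d lane31=%d\n",
          ((const char *) &sumz)[0], ((const char *) &sumz)[31]); /* 3 0 */
  return 0;
}

The same pass-through/zeroing pairing applies to every _mask_/_maskz_ pair added in this hunk.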
+extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 0, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 0, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqw128_mask ((__v8hi) __A, + (__v8hi) __B, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 0, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 0, + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpeqw256_mask ((__v16hi) __A, + (__v16hi) __B, + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 6, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __A, + (__v16qi) __B, 6, + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtb128_mask ((__v16qi) __A, + (__v16qi) __B, + __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 6, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi8_mask (__m256i __A, __m256i __B) +{ + return 
(__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __A, + (__v32qi) __B, 6, + __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_pcmpgtb256_mask ((__v32qi) __A, + (__v32qi) __B, + __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 6, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __A, + (__v8hi) __B, 6, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtw128_mask ((__v8hi) __A, + (__v8hi) __B, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 6, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __A, + (__v16hi) __B, 6, + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_pcmpgtw256_mask ((__v16hi) __A, + (__v16hi) __B, + __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi8_mask (__m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, + (__v16qi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmb128 ((__v16qi) __A, + (__v16qi) __B, __U); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi8_mask (__m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, + (__v32qi) __B, + (__mmask32) -1); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_ptestnmb256 ((__v32qi) __A, + (__v32qi) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi16_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, + (__v8hi) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmw128 ((__v8hi) __A, + (__v8hi) __B, __U); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi16_mask (__m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, + (__v16hi) __B, + (__mmask16) -1); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__mmask16) __builtin_ia32_ptestnmw256 ((__v16hi) __A, + (__v16hi) __B, __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_epi8 (__m256i __W, __mmask32 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_epi8 (__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pshufb256_mask ((__v32qi) __A, + (__v32qi) __B, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pshufb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packs_epi16 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packs_epi16 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packsswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) __W, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packs_epi16 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packsswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packs_epi16 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) 
__builtin_ia32_packsswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) __W, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packus_epi16 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packus_epi16 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packuswb256_mask ((__v16hi) __A, + (__v16hi) __B, + (__v32qi) __W, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packus_epi16 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packus_epi16 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_packuswb128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v16qi) __W, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi8 (__m256i __W, __mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A, + (__v32qi) __W, + (__mmask32) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsb256_mask ((__v32qi) __A, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi8 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi8 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsb128_mask ((__v16qi) __A, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi16 (__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi16 (__mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsw256_mask ((__v16hi) __A, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsw128_mask ((__v8hi) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu8_mask 
(__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) -1); +} + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) -1); +} + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) -1); +} + +extern __inline __mmask32 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu8_mask (__m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) -1); +} + +extern __inline __mmask16 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu16_mask (__m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi16 (void *__P, __m256i __A) +{ + *(__v16hi_u *) __P = (__v16hi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) +{ + __builtin_ia32_storedquhi256_mask ((short *) __P, + (__v16hi) __A, + (__mmask16) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi16 (void *__P, __m128i __A) +{ + *(__v8hi_u *) __P = (__v8hi_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedquhi128_mask ((short *) __P, + (__v8hi) __A, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_adds_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epi8 
(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epu8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_subs_epu16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_subs_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubusw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srl_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srl_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srl_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_maskz_sra_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psraw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_adds_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddsw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_adds_epu8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_adds_epu8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddusb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_adds_epu16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_adds_epu16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddusw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_adds_epi8 (__m128i __W, __mmask16 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_adds_epi8 (__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddsb128_mask ((__v16qi) __A, + (__v16qi) __B, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi8 (__m128i __A) +{ + + return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M,__m128i __A) +{ + __builtin_ia32_pmovwb128mem_mask ((unsigned long long *) __P , (__v8hi) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srav_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srav_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psrav16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srav_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srav_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrav8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srlv_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psrlv16hi_mask 
((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psrlv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srlv_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sllv_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi16 (__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psllv16hi_mask ((__v16hi) __A, + (__v16hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sllv_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllv8hi_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi16 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) 
__builtin_ia32_psllw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllw128_mask ((__v8hi) __A, + (__v8hi) __B, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi16 (__m256i __W, __mmask16 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sll_epi16 (__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psllw256_mask ((__v16hi) __A, + (__v8hi) __B, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packus_epi32 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packus_epi32 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packusdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) __W, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packus_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packus_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_packusdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_packs_epi32 (__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_packs_epi32 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_packssdw256_mask ((__v8si) __A, + (__v8si) __B, + (__v16hi) __W, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_packs_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_packs_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_packssdw128_mask ((__v4si) __A, + (__v4si) __B, + (__v8hi) __W, __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask 
((__v16qi) __X, + (__v16qi) __Y, 4, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 1, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 5, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 2, + (__mmask16) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 4, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 1, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 5, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi8_mask (__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__mmask16) __builtin_ia32_cmpb128_mask ((__v16qi) __X, + (__v16qi) __Y, 2, + (__mmask16) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return 
(__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi16_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpw128_mask ((__v8hi) __X, + (__v8hi) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_ucmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_ucmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 4, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 1, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_cmpge_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 5, + (__mmask32) __M); +} + +extern __inline __mmask32 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi8_mask (__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__mmask32) __builtin_ia32_cmpb256_mask ((__v32qi) __X, + (__v32qi) __Y, 2, + (__mmask32) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 4, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 1, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 5, + (__mmask16) __M); +} + +extern __inline __mmask16 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi16_mask (__mmask16 __M, __m256i __X, __m256i __Y) +{ + return (__mmask16) __builtin_ia32_cmpw256_mask ((__v16hi) __X, + (__v16hi) __Y, 2, + (__mmask16) __M); +} + +#ifdef __DISABLE_AVX512VLBW__ +#undef __DISABLE_AVX512VLBW__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VLBW__ */ + +#endif /* _AVX512VLBWINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vldqintrin.h b/include-gcc/avx512vldqintrin.h new file mode 100644 index 0000000..be4d59c --- /dev/null +++ b/include-gcc/avx512vldqintrin.h @@ -0,0 +1,2016 @@ +/* Copyright (C) 2014-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef _AVX512VLDQINTRIN_H_INCLUDED +#define _AVX512VLDQINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512DQ__) +#pragma GCC push_options +#pragma GCC target("avx512vl,avx512dq") +#define __DISABLE_AVX512VLDQ__ +#endif /* __AVX512VLDQ__ */ + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epi64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epi64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epu64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epu64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_cvtpd_epi64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epi64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epu64 (__m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epu64 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epi64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, 
__m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epi64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epu64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epu64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_f64x2 (__m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) + __A, + (__v4df)_mm256_undefined_pd(), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_f64x2 (__m256d __O, __mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastf64x2_256_mask ((__v2df) + __A, + (__v4df) + __O, __M); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastf64x2_256_mask 
((__v2df) + __A, + (__v4df) + _mm256_setzero_ps (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_i64x2 (__m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) + __A, + (__v4di)_mm256_undefined_si256(), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_i64x2 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) + __A, + (__v4di) + __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti64x2_256_mask ((__v2di) + __A, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_f32x2 (__m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, + (__v8sf)_mm256_undefined_ps(), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x2_256_mask ((__v4sf) __A, + (__v8sf) + _mm256_setzero_ps (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_i32x2 (__m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) + __A, + (__v8si)_mm256_undefined_si256(), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) + __A, + (__v8si) + __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x2_256_mask ((__v4si) + __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcast_i32x2 (__m128i __A) +{ + return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) + __A, + (__v4si)_mm_undefined_si128(), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) + __A, + (__v4si) + __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_broadcasti32x2_128_mask ((__v4si) + __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mullo_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du) __A * (__v4du) __B); +} + +extern __inline __m256i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mullo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mullo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmullq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du) __A * (__v2du) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mullo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mullo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmullq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andnpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_andnpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_ps (__mmask8 __U, __m128 __A, 
__m128 __B) +{ + return (__m128) __builtin_ia32_andnps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epi64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epi64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epu64 (__m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epu64 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_ps (__m256i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, + (__v4sf) + 
_mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps256_mask ((__v4di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu64_ps (__m256i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps256_mask ((__v4di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu64_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_pd (__m256i __A) +{ + return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) +{ + return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) +{ + return (__m256d) __builtin_ia32_cvtqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_pd (__m128i __A) +{ + return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu64_pd (__m256i __A) +{ + return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) +{ + return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) +{ + return (__m256d) __builtin_ia32_cvtuqq2pd256_mask ((__v4di) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_andpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_andps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu64_pd (__m128i __A) +{ + return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtuqq2pd128_mask ((__v2di) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_xorpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_xorps128_mask ((__v4sf) 
__A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_pd (__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_orpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_orps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_orps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movm_epi32 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2d128 (__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movm_epi32 (__mmask8 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2d256 (__A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movm_epi64 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2q128 (__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movm_epi64 (__mmask8 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2q256 (__A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi32_mask (__m128i __A) +{ + return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movepi32_mask (__m256i __A) +{ + return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A); +} + +extern __inline 
__mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi64_mask (__m128i __A) +{ + return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movepi64_mask (__m256i __A) +{ + return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf64x2_pd (__m256d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extractf64x2_pd (__m128d __W, __mmask8 __U, __m256d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A, + __imm, + (__v2df) __W, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extractf64x2_pd (__mmask8 __U, __m256d __A, + const int __imm) +{ + return (__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti64x2_epi64 (__m256i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extracti64x2_epi64 (__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extracti64x2_epi64 (__mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di) __A, + __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_pd (__m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_pd (__m256d __W, __mmask8 __U, __m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_pd (__mmask8 __U, __m256d __A, int __B) +{ + return (__m256d) __builtin_ia32_reducepd256_mask ((__v4df) __A, __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_pd (__m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_pd (__m128d __W, __mmask8 __U, __m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask 
((__v2df) __A, __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_pd (__mmask8 __U, __m128d __A, int __B) +{ + return (__m128d) __builtin_ia32_reducepd128_mask ((__v2df) __A, __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_reduce_ps (__m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_reduce_ps (__m256 __W, __mmask8 __U, __m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_reduce_ps (__mmask8 __U, __m256 __A, int __B) +{ + return (__m256) __builtin_ia32_reduceps256_mask ((__v8sf) __A, __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_reduce_ps (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_reduce_ps (__m128 __W, __mmask8 __U, __m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_reduce_ps (__mmask8 __U, __m128 __A, int __B) +{ + return (__m128) __builtin_ia32_reduceps128_mask ((__v4sf) __A, __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_range_pd (__m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_range_pd (__m256d __W, __mmask8 __U, + __m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_range_pd (__mmask8 __U, __m256d __A, __m256d __B, int __C) +{ + return (__m256d) __builtin_ia32_rangepd256_mask ((__v4df) __A, + (__v4df) __B, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_pd (__m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_pd (__m128d __W, __mmask8 __U, + __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_maskz_range_pd (__mmask8 __U, __m128d __A, __m128d __B, int __C) +{ + return (__m128d) __builtin_ia32_rangepd128_mask ((__v2df) __A, + (__v2df) __B, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_range_ps (__m256 __A, __m256 __B, int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_range_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B, + int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_range_ps (__mmask8 __U, __m256 __A, __m256 __B, int __C) +{ + return (__m256) __builtin_ia32_rangeps256_mask ((__v8sf) __A, + (__v8sf) __B, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_range_ps (__m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_range_ps (__m128 __W, __mmask8 __U, + __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_range_ps (__mmask8 __U, __m128 __A, __m128 __B, int __C) +{ + return (__m128) __builtin_ia32_rangeps128_mask ((__v4sf) __A, + (__v4sf) __B, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_pd_mask (__mmask8 __U, __m256d __A, + const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_pd_mask (__m256d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) __A, + __imm, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fpclass_ps_mask (__mmask8 __U, __m256 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fpclass_ps_mask (__m256 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) __A, + __imm, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_pd_mask (__mmask8 __U, __m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_pd_mask (__m128d __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) __A, + 
__imm, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fpclass_ps_mask (__mmask8 __U, __m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A, + __imm, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fpclass_ps_mask (__m128 __A, const int __imm) +{ + return (__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) __A, + __imm, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti64x2 (__m256i __A, __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_inserti64x2 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_inserti64x2 (__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di) __A, + (__v2di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf64x2 (__m256d __A, __m128d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_insertf64x2 (__m256d __W, __mmask8 __U, __m256d __A, + __m128d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) __W, + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_insertf64x2 (__mmask8 __U, __m256d __A, __m128d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df) __A, + (__v2df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} + +#else +#define _mm256_insertf64x2(X, Y, C) \ + ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\ + (__v2df)(__m128d) (Y), (int) (C), \ + (__v4df)(__m256d)_mm256_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_insertf64x2(W, U, X, Y, C) \ + ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\ + (__v2df)(__m128d) (Y), (int) (C), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_insertf64x2(U, X, Y, C) \ + ((__m256d) __builtin_ia32_insertf64x2_256_mask ((__v4df)(__m256d) (X),\ + (__v2df)(__m128d) (Y), (int) (C), \ + (__v4df)(__m256d)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_inserti64x2(X, Y, C) \ + ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\ + (__v2di)(__m128i) (Y), (int) (C), \ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)-1)) + +#define _mm256_mask_inserti64x2(W, U, X, Y, C) \ + ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\ + (__v2di)(__m128i) (Y), (int) (C), \ + 
(__v4di)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_inserti64x2(U, X, Y, C) \ + ((__m256i) __builtin_ia32_inserti64x2_256_mask ((__v4di)(__m256i) (X),\ + (__v2di)(__m128i) (Y), (int) (C), \ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm256_extractf64x2_pd(X, C) \ + ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\ + (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8)-1)) + +#define _mm256_mask_extractf64x2_pd(W, U, X, C) \ + ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\ + (int) (C), (__v2df)(__m128d) (W), (__mmask8) (U))) + +#define _mm256_maskz_extractf64x2_pd(U, X, C) \ + ((__m128d) __builtin_ia32_extractf64x2_256_mask ((__v4df)(__m256d) (X),\ + (int) (C), (__v2df)(__m128d) _mm_setzero_pd(), (__mmask8) (U))) + +#define _mm256_extracti64x2_epi64(X, C) \ + ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\ + (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8)-1)) + +#define _mm256_mask_extracti64x2_epi64(W, U, X, C) \ + ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\ + (int) (C), (__v2di)(__m128i) (W), (__mmask8) (U))) + +#define _mm256_maskz_extracti64x2_epi64(U, X, C) \ + ((__m128i) __builtin_ia32_extracti64x2_256_mask ((__v4di)(__m256i) (X),\ + (int) (C), (__v2di)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) + +#define _mm256_reduce_pd(A, B) \ + ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) + +#define _mm256_mask_reduce_pd(W, U, A, B) \ + ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_reduce_pd(U, A, B) \ + ((__m256d) __builtin_ia32_reducepd256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) + +#define _mm_reduce_pd(A, B) \ + ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), \ + (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)-1)) + +#define _mm_mask_reduce_pd(W, U, A, B) \ + ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), \ + (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_pd(U, A, B) \ + ((__m128d) __builtin_ia32_reducepd128_mask ((__v2df)(__m128d)(A), \ + (int)(B), (__v2df)_mm_setzero_pd(), (__mmask8)(U))) + +#define _mm256_reduce_ps(A, B) \ + ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), \ + (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) + +#define _mm256_mask_reduce_ps(W, U, A, B) \ + ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), \ + (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_reduce_ps(U, A, B) \ + ((__m256) __builtin_ia32_reduceps256_mask ((__v8sf)(__m256)(A), \ + (int)(B), (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) + +#define _mm_reduce_ps(A, B) \ + ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), \ + (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) + +#define _mm_mask_reduce_ps(W, U, A, B) \ + ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), \ + (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_ps(U, A, B) \ + ((__m128) __builtin_ia32_reduceps128_mask ((__v4sf)(__m128)(A), \ + (int)(B), (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) + +#define _mm256_range_pd(A, B, C) \ + ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)-1)) + +#define 
_mm256_maskz_range_pd(U, A, B, C) \ + ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), (__mmask8)(U))) + +#define _mm_range_pd(A, B, C) \ + ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)-1)) + +#define _mm256_range_ps(A, B, C) \ + ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)-1)) + +#define _mm256_mask_range_ps(W, U, A, B, C) \ + ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_range_ps(U, A, B, C) \ + ((__m256) __builtin_ia32_rangeps256_mask ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), (__mmask8)(U))) + +#define _mm_range_ps(A, B, C) \ + ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1)) + +#define _mm_mask_range_ps(W, U, A, B, C) \ + ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_range_ps(U, A, B, C) \ + ((__m128) __builtin_ia32_rangeps128_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)(U))) + +#define _mm256_mask_range_pd(W, U, A, B, C) \ + ((__m256d) __builtin_ia32_rangepd256_mask ((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm_mask_range_pd(W, U, A, B, C) \ + ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_range_pd(U, A, B, C) \ + ((__m128d) __builtin_ia32_rangepd128_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), (__mmask8)(U))) + +#define _mm256_mask_fpclass_pd_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \ + (int) (C),(__mmask8)(u))) + +#define _mm256_mask_fpclass_ps_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), \ + (int) (C),(__mmask8)(u))) + +#define _mm_mask_fpclass_pd_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \ + (int) (C),(__mmask8)(u))) + +#define _mm_mask_fpclass_ps_mask(u, X, C) \ + ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), \ + (int) (C),(__mmask8)(u))) + +#define _mm256_fpclass_pd_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclasspd256_mask ((__v4df) (__m256d) (X), \ + (int) (C),(__mmask8)-1)) + +#define _mm256_fpclass_ps_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclassps256_mask ((__v8sf) (__m256) (X), \ + (int) (C),(__mmask8)-1)) + +#define _mm_fpclass_pd_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclasspd128_mask ((__v2df) (__m128d) (X), \ + (int) (C),(__mmask8)-1)) + +#define _mm_fpclass_ps_mask(X, C) \ + ((__mmask8) __builtin_ia32_fpclassps128_mask ((__v4sf) (__m128) (X), \ + (int) (C),(__mmask8)-1)) + +#endif + +#ifdef __DISABLE_AVX512VLDQ__ +#undef __DISABLE_AVX512VLDQ__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VLDQ__ */ + +#endif /* _AVX512VLDQINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vlintrin.h b/include-gcc/avx512vlintrin.h new file mode 100644 index 0000000..758b71a --- /dev/null 
+++ b/include-gcc/avx512vlintrin.h @@ -0,0 +1,13896 @@ +/* Copyright (C) 2014-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + <http://www.gnu.org/licenses/>. */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef _AVX512VLINTRIN_H_INCLUDED +#define _AVX512VLINTRIN_H_INCLUDED + +#ifndef __AVX512VL__ +#pragma GCC push_options +#pragma GCC target("avx512vl") +#define __DISABLE_AVX512VL__ +#endif /* __AVX512VL__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef unsigned int __mmask32; +typedef int __v4si_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef int __v8si_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); +typedef long long __v2di_u __attribute__ ((__vector_size__ (16), \ + __may_alias__, __aligned__ (1))); +typedef long long __v4di_u __attribute__ ((__vector_size__ (32), \ + __may_alias__, __aligned__ (1))); + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movapd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movapd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadapd256_mask ((__v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_pd (__m128d __W, __mmask8
__U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadapd128_mask ((__v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_storeapd256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storeapd128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movaps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movaps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadaps256_mask ((__v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadaps128_mask ((__v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_storeaps256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storeaps128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa64_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa64_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi64 (void const *__P) +{ + return (__m256i) (*(__v4di *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa64load256_mask ((__v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa64load128_mask ((__v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_movdqa64store256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_movdqa64store128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_movdqa32_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); 
+} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_movdqa32_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa32load256_mask ((__v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_load_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa32load128_mask ((__v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_epi32 (void *__P, __m256i __A) +{ + *(__v8si *) __P = (__v8si) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_epi32 (void *__P, __m128i __A) +{ + *(__v4si *) __P = (__v4si) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_addpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_add_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_addpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_subpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_subpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return 
(__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_subps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_epi64 (void *__P, __m256i __A) +{ + *(__m256i *) __P = __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_epi64 (void *__P, __m128i __A) +{ + *(__m128i *) __P = __A; +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadupd256_mask ((const double *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadupd128_mask ((const double *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_storeupd256_mask ((double *) __P, + (__v4df) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storeupd128_mask ((double *) __P, + (__v2df) __A, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadups256_mask ((const float *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadups128_mask ((const float *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_storeups256_mask ((float *) __P, + (__v8sf) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storeups128_mask ((float *) __P, + (__v4sf) __A, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi64 (void const *__P) +{ + return (__m256i) (*(__v4di_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const long long *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi64 (void const *__P) +{ + return (__m128i) (*(__v2di_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const long long *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi64 (void *__P, __m256i __A) +{ + *(__m256i_u *) __P = (__m256i_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_storedqudi256_mask ((long long *) __P, + (__v4di) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi64 (void *__P, __m128i __A) +{ + *(__m128i_u *) __P = (__m128i_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedqudi128_mask ((long long *) __P, + (__v2di) __A, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_epi32 (void const *__P) +{ + return (__m256i) (*(__v8si_u *) __P); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const int *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_epi32 (void const *__P) +{ + return (__m128i) (*(__v4si_u *) __P); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const int *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_epi32 (void *__P, __m256i __A) +{ + *(__m256i_u *) __P = (__m256i_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_storedqusi256_mask ((int *) __P, + (__v8si) __A, + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_epi32 (void *__P, __m128i __A) +{ + *(__m128i_u *) __P = (__m128i_u) __A; +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedqusi128_mask ((int *) __P, + (__v4si) __A, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsd256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsd128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_abs_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_pabsq256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi64 (__m128i __A) 
+{ + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pabsq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epu32 (__m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epu32 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2dq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2dq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epu32 (__m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) + 
_mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epu32 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epu32 (__m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epu32 (__m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + 
+extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) +{ + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) +{ + return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) +{ + return (__m256d) __builtin_ia32_cvtdq2pd256_mask ((__v4si) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtdq2pd128_mask ((__v4si) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_pd (__m128i __A) +{ + return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) +{ + return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) +{ + return (__m256d) __builtin_ia32_cvtudq2pd256_mask ((__v4si) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_pd (__m128i __A) +{ + return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) +{ + return (__m128d) __builtin_ia32_cvtudq2pd128_mask ((__v4si) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) +{ + return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) +{ + return (__m256) __builtin_ia32_cvtdq2ps256_mask ((__v8si) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtdq2ps128_mask ((__v4si) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepu32_ps (__m256i __A) +{ + return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) +{ + return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) +{ + return (__m256) __builtin_ia32_cvtudq2ps256_mask ((__v8si) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_cvtudq2ps128_mask ((__v4si) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) +{ 
+ return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) +{ + return (__m256d) __builtin_ia32_cvtps2pd256_mask ((__v4sf) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) +{ + return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) +{ + return (__m128d) __builtin_ia32_cvtps2pd128_mask ((__v4sf) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) __O, __M); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi32_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi32_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi32_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusdb128mem_mask ((unsigned int *) __P, (__v4si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi32_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusdb256mem_mask ((unsigned long long *) __P, (__v8si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) + 
_mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi32_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi)__O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi32_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + 
__builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi32_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusdw128mem_mask ((unsigned long long *) __P, (__v4si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi32_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi64_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi64_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi64_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqb128mem_mask ((unsigned short *) __P, (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi64_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqb256mem_mask ((unsigned int *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi)__O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) 
__builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqw128mem_mask ((unsigned int *) __P, (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_cvtusepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqw256mem_mask ((unsigned long long *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqd128mem_mask ((unsigned long long *) __P, + (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M); +} + +extern __inline 
__m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si)__O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtusepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqd128mem_mask ((unsigned long long *) __P, (__v2di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtusepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) + _mm_undefined_si128 (), + (__mmask8) -1); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) __O, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) 
__builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastss256_mask ((__v4sf) __A, + (__v8sf) + _mm256_setzero_ps (), + __M); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) +{ + return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, + (__v4sf) __O, + __M); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +{ + return (__m128) __builtin_ia32_broadcastss128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + __M); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, + (__v4df) __O, + __M); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +{ + return (__m256d) __builtin_ia32_broadcastsd256_mask ((__v2df) __A, + (__v4df) + _mm256_setzero_pd (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, + (__v8si) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_mask ((__v4si) __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_set1_epi32 (__m256i __O, __mmask8 __M, int __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, (__v8si) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_set1_epi32 (__mmask8 __M, int __A) +{ + return (__m256i) __builtin_ia32_pbroadcastd256_gpr_mask (__A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, + (__v4si) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastd128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_set1_epi32 (__m128i __O, __mmask8 __M, int __A) +{ + return (__m128i) __builtin_ia32_pbroadcastd128_gpr_mask (__A, (__v4si) __O, + __M); +} + +extern __inline __m128i +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_set1_epi32 (__mmask8 __M, int __A) +{ + return (__m128i) + __builtin_ia32_pbroadcastd128_gpr_mask (__A, + (__v4si) _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, + (__v4di) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_mask ((__v2di) __A, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, (__v4di) __O, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m256i) __builtin_ia32_pbroadcastq256_gpr_mask (__A, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, + (__v2di) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pbroadcastq128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) +{ + return (__m128i) __builtin_ia32_pbroadcastq128_gpr_mask (__A, (__v2di) __O, + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m128i) + __builtin_ia32_pbroadcastq128_gpr_mask (__A, + (__v2di) _mm_setzero_si128 (), + __M); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_f32x4 (__m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf)_mm256_undefined_pd (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_f32x4 (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf) __O, + __M); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) +{ + return (__m256) __builtin_ia32_broadcastf32x4_256_mask ((__v4sf) __A, + (__v8sf) + _mm256_setzero_ps (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_i32x4 (__m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si)_mm256_undefined_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_broadcast_i32x4 (__m256i __O, __mmask8 __M, __m128i 
__A) +{ + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si) + __O, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_broadcast_i32x4 (__mmask8 __M, __m128i __A) +{ + return (__m256i) __builtin_ia32_broadcasti32x4_256_mask ((__v4si) + __A, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbd256_mask ((__v16qi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbd128_mask ((__v16qi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxbq256_mask ((__v16qi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxbq128_mask ((__v16qi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwd256_mask ((__v8hi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_epi32 (__mmask8 __U, 
__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwd128_mask ((__v8hi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovsxwq256_mask ((__v8hi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsxwq128_mask ((__v8hi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepi32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovsxdq256_mask ((__v4si) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepi32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepi32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxdq128_mask ((__v4si) __X, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbd256_mask ((__v16qi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu8_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu8_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbd128_mask ((__v16qi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_cvtepu8_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxbq256_mask ((__v16qi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu8_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxbq128_mask ((__v16qi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu16_epi32 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwd256_mask ((__v8hi) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu16_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwd128_mask ((__v8hi) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu16_epi64 (__m256i __W, __mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m256i) __builtin_ia32_pmovzxwq256_mask ((__v8hi) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu16_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu16_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovzxwq128_mask ((__v8hi) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtepu32_epi64 (__m256i __W, __mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m256i) __builtin_ia32_pmovzxdq256_mask ((__v4si) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtepu32_epi64 (__m128i __W, __mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtepu32_epi64 (__mmask8 __U, __m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxdq128_mask ((__v4si) __X, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp14_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp14_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp14_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + 
(__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt14_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt14_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt14_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_sqrt_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_sqrtpd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sqrt_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sqrt_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sqrt_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sqrt_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_sqrtps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_add_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_add_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_paddq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_maskz_sub_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sub_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sub_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_psubq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_add_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_add_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_paddq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sub_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sub_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psubq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getexp_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getexp_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getexp_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srl_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srl_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srl_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ 
+ return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srl_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srl_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srl_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrlq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srl_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srl_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrlq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_scalef_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_scalef_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) 
__builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_pd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_scalef_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_mask3 ((__v2df) __A, + 
(__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + 
(__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmaddsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmaddsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmaddsub_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) 
__C, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmaddsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmaddsub_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmaddsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmaddsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmaddsub_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsubadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_mask ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfmsubaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsubadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsubadd_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_mask ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsubadd_pd (__m128d __A, __m128d __B, __m128d __C, + __mmask8 __U) +{ + return (__m128d) 
__builtin_ia32_vfmsubaddpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsubadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C, + (__mmask8) + __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fmsubadd_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfmsubaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fmsubadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fmsubadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C, + __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfmsubaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fmsubadd_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmadd_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmadd_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmaddpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C, 
+ __mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfnmaddpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmaddpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmadd_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmaddps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfnmaddps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmadd_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmaddps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmadd_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfnmaddps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmadd_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmaddps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmsub_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C, + __mmask8 __U) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_mask3 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmsub_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256d __C) +{ + return (__m256d) __builtin_ia32_vfnmsubpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C, + 
__mmask8 __U) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_mask3 ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128d __C) +{ + return (__m128d) __builtin_ia32_vfnmsubpd128_maskz ((__v2df) __A, + (__v2df) __B, + (__v2df) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fnmsub_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmsubps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask3_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C, + __mmask8 __U) +{ + return (__m256) __builtin_ia32_vfnmsubps256_mask3 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fnmsub_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_vfnmsubps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fnmsub_ps (__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask3_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_vfnmsubps128_mask3 ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fnmsub_ps (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfnmsubps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + 
__m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A | (__v8su)__B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A | (__v4su)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxord256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v8su)__A ^ (__v8su)__B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxord128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A ^ 
(__v4su)__B); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) +{ + return (__m128) __builtin_ia32_cvtpd2ps256_mask ((__v4df) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2dq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2dq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epu32 (__m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) +{ + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epu32 (__m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) +{ + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_movddup256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_movddup128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movshdup256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movshdup128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_movsldup256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_movsldup128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi32 (__m128i __W, __mmask8 __U, 
__m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhdq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhdq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckhqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckhqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpckldq128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpckldq256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_punpcklqdq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_punpcklqdq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 0, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 0, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd128_mask ((__v4si) __A, + (__v4si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 0, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 0, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqd256_mask ((__v8si) __A, + (__v8si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epu64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 0, + (__mmask8) -1); +} + +extern __inline __mmask8 
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 0, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpeq_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq128_mask ((__v2di) __A, + (__v2di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epu64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 0, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpeq_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 0, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpeq_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpeqq256_mask ((__v4di) __A, + (__v4di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 6, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __A, + (__v4si) __B, 6, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd128_mask ((__v4si) __A, + (__v4si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + (__v8si) __B, 6, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __A, + 
(__v8si) __B, 6, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtd256_mask ((__v8si) __A, + (__v8si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epu64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 6, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epu64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __A, + (__v2di) __B, 6, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpgt_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq128_mask ((__v2di) __A, + (__v2di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epu64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 6, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpgt_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epu64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __A, + (__v4di) __B, 6, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpgt_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_pcmpgtq256_mask ((__v4di) __A, + (__v4di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd128 ((__v4si) __A, + (__v4si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmd256 ((__v8si) __A, + (__v8si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_test_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq128 
((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq128 ((__v2di) __A, + (__v2di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_test_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestmq256 ((__v4di) __A, + (__v4di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi32_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd128 ((__v4si) __A, + (__v4si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi32_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmd256 ((__v8si) __A, + (__v8si) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testn_epi64_mask (__m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq128 ((__v2di) __A, + (__v2di) __B, __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testn_epi64_mask (__m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__mmask8) __builtin_ia32_ptestnmq256 ((__v4di) __A, + (__v4di) __B, __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) +{ + 
__builtin_ia32_compressstoredf256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_compressstoredf128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_compressstoredi256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) 
__W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_compressstoredi128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_compressstoresi256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_compressstoresi128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_expanddf256_maskz ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_expandloaddf256_mask ((__v4df *) __P, + (__v4df) __W, + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_expandloaddf256_maskz ((__v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, + 
(__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_expanddf128_maskz ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_expandloaddf128_mask ((__v2df *) __P, + (__v2df) __W, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_expandloaddf128_maskz ((__v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_expandsf256_maskz ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_expandloadsf256_mask ((__v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_expandloadsf256_maskz ((__v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) + __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_expandsf128_maskz ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_expandloadsf128_mask ((__v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_expandloadsf128_maskz ((__v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expanddi256_maskz ((__v4di) __A, + 
(__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, + void const *__P) +{ + return (__m256i) __builtin_ia32_expandloaddi256_mask ((__v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloaddi256_maskz ((__v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expanddi128_maskz ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloaddi128_mask ((__v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloaddi128_maskz ((__v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_expandsi256_maskz ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, + void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadsi256_mask ((__v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadsi256_maskz ((__v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_expandsi128_maskz ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadsi128_mask ((__v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadsi128_maskz ((__v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_pd (__m256d __A, __m256i __I, __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I + /* idx */ , + (__v4df) __A, + (__v4df) __B, + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_pd (__m256d __A, __mmask8 __U, __m256i __I, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_mask ((__v4di) __I + /* idx */ , + (__v4df) __A, + (__v4df) __B, + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_pd (__m256d __A, __m256i __I, __mmask8 __U, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermi2varpd256_mask ((__v4df) __A, + (__v4di) __I + /* idx */ , + (__v4df) __B, + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_pd (__mmask8 __U, __m256d __A, __m256i __I, + __m256d __B) +{ + return (__m256d) __builtin_ia32_vpermt2varpd256_maskz ((__v4di) __I + /* idx */ , + (__v4df) __A, + (__v4df) __B, + (__mmask8) + __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_ps (__m256 __A, __m256i __I, __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I + /* idx */ , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_ps (__m256 __A, __mmask8 __U, __m256i __I, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_mask ((__v8si) __I + /* idx */ , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_ps (__m256 __A, __m256i __I, __mmask8 __U, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermi2varps256_mask ((__v8sf) __A, + (__v8si) __I + /* idx */ , + (__v8sf) __B, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_ps (__mmask8 __U, __m256 __A, __m256i __I, + __m256 __B) +{ + return (__m256) __builtin_ia32_vpermt2varps256_maskz ((__v8si) __I + /* idx */ , + (__v8sf) __A, + (__v8sf) __B, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi64 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I + /* idx */ , + (__v2di) __A, + (__v2di) __B, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi64 (__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_mask ((__v2di) __I + 
/* idx */ , + (__v2di) __A, + (__v2di) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi64 (__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2varq128_mask ((__v2di) __A, + (__v2di) __I + /* idx */ , + (__v2di) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi64 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2varq128_maskz ((__v2di) __I + /* idx */ , + (__v2di) __A, + (__v2di) __B, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_epi32 (__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I + /* idx */ , + (__v4si) __A, + (__v4si) __B, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_epi32 (__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_mask ((__v4si) __I + /* idx */ , + (__v4si) __A, + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_epi32 (__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermi2vard128_mask ((__v4si) __A, + (__v4si) __I + /* idx */ , + (__v4si) __B, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_epi32 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i) __builtin_ia32_vpermt2vard128_maskz ((__v4si) __I + /* idx */ , + (__v4si) __A, + (__v4si) __B, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi64 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I + /* idx */ , + (__v4di) __A, + (__v4di) __B, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi64 (__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_mask ((__v4di) __I + /* idx */ , + (__v4di) __A, + (__v4di) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi64 (__m256i __A, __m256i __I, + __mmask8 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2varq256_mask ((__v4di) __A, + (__v4di) __I + /* idx */ , + (__v4di) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi64 (__mmask8 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2varq256_maskz ((__v4di) __I + /* idx */ , + (__v4di) __A, + (__v4di) __B, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex2var_epi32 (__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I + /* idx */ , + (__v8si) __A, + (__v8si) __B, + (__mmask8) -1); +} + 
+extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex2var_epi32 (__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_mask ((__v8si) __I + /* idx */ , + (__v8si) __A, + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask2_permutex2var_epi32 (__m256i __A, __m256i __I, + __mmask8 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermi2vard256_mask ((__v8si) __A, + (__v8si) __I + /* idx */ , + (__v8si) __B, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex2var_epi32 (__mmask8 __U, __m256i __A, + __m256i __I, __m256i __B) +{ + return (__m256i) __builtin_ia32_vpermt2vard256_maskz ((__v8si) __I + /* idx */ , + (__v8si) __A, + (__v8si) __B, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_pd (__m128d __A, __m128i __I, __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I + /* idx */ , + (__v2df) __A, + (__v2df) __B, + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_pd (__m128d __A, __mmask8 __U, __m128i __I, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_mask ((__v2di) __I + /* idx */ , + (__v2df) __A, + (__v2df) __B, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_pd (__m128d __A, __m128i __I, __mmask8 __U, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermi2varpd128_mask ((__v2df) __A, + (__v2di) __I + /* idx */ , + (__v2df) __B, + (__mmask8) + __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_pd (__mmask8 __U, __m128d __A, __m128i __I, + __m128d __B) +{ + return (__m128d) __builtin_ia32_vpermt2varpd128_maskz ((__v2di) __I + /* idx */ , + (__v2df) __A, + (__v2df) __B, + (__mmask8) + __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutex2var_ps (__m128 __A, __m128i __I, __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I + /* idx */ , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutex2var_ps (__m128 __A, __mmask8 __U, __m128i __I, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_mask ((__v4si) __I + /* idx */ , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask2_permutex2var_ps (__m128 __A, __m128i __I, __mmask8 __U, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermi2varps128_mask ((__v4sf) __A, + (__v4si) __I + /* idx */ , + (__v4sf) __B, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutex2var_ps (__mmask8 __U, __m128 __A, __m128i __I, + __m128 __B) +{ + return (__m128) __builtin_ia32_vpermt2varps128_maskz ((__v4si) __I + /* idx */ , + (__v4sf) __A, + (__v4sf) __B, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_srav_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srav_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psravq128_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sllv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sllv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psllv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sllv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sllv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psllv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_maskz_srav_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrav8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srav_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srav_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrav4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi32 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi32 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv8si_mask ((__v8si) __X, + (__v8si) __Y, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi32 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi32 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv4si_mask ((__v4si) __X, + (__v4si) __Y, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srlv_epi64 (__m256i __W, __mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srlv_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psrlv4di_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srlv_epi64 (__m128i __W, __mmask8 __U, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srlv_epi64 (__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psrlv2di_mask ((__v2di) __X, + (__v2di) __Y, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rolv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rolv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rorv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rorv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rolv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prolvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rolv_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prolvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rorv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_prorvq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rorv_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_prorvq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srav_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srav_epi64 (__m256i __W, 
__mmask8 __U, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_psravq256_mask ((__v4di) __X, + (__v4di) __Y, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_and_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_and_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_and_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_and_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_andnot_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_andnot_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pandnq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_pd (), + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_andnot_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_andnot_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pandnq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_pd (), + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_or_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_or_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_porq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A | (__v4du)__B); +} + +extern __inline 
__m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_or_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_or_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_porq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A | (__v2du)__B); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_xor_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_xor_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pxorq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) ((__v4du)__A ^ (__v4du)__B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_xor_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_xor_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pxorq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A ^ (__v2du)__B); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_mask_div_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_div_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_div_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_divpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_div_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_div_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_ps (__m128 __W, __mmask8 __U, 
__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_minpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_maxpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_pd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_mulpd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_ps (__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + 
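The merge- and zero-masking forms above follow the usual AVX-512VL pattern: the _mm*_mask_*() variants blend the result into __W under the write mask __U, while the _mm*_maskz_*() variants zero the unselected lanes. A minimal usage sketch, assuming a toolchain with AVX512F/AVX512VL enabled (e.g. -mavx512vl) and a hypothetical helper clamp_selected; it is illustrative only and not part of the imported header:

#include <immintrin.h>

/* Clamp only the lanes selected by `keep` to [lo, hi]; unselected lanes
   pass through `x` unchanged via the merge-masking (_mask_) forms. */
static __m256d
clamp_selected (__m256d x, __m256d lo, __m256d hi, __mmask8 keep)
{
  __m256d t = _mm256_mask_max_pd (x, keep, x, lo);  /* per-lane max with lo */
  return _mm256_mask_min_pd (x, keep, t, hi);       /* then per-lane min with hi */
}
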
+extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_mulpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_epu64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_min_epu64 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminuq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminsd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_max_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_max_epu32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmaxud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_min_epu32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_min_epu32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pminud256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) 
__builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminuq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_maskz_min_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminsd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_max_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_max_epu32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmaxud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_min_epu32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_min_epu32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pminud128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} + +#ifndef __AVX512CD__ +#pragma GCC push_options +#pragma GCC target("avx512vl,avx512cd") +#define __DISABLE_AVX512VLCD__ +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_broadcastmb128 (__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m256i) __builtin_ia32_broadcastmb256 (__A); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m128i) __builtin_ia32_broadcastmw128 (__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m256i) __builtin_ia32_broadcastmw256 (__A); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lzcnt_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lzcnt_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, 
+ (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conflict_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_conflict_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lzcnt_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lzcnt_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + 
(__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conflict_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_conflict_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +#ifdef __DISABLE_AVX512VLCD__ +#pragma GCC pop_options +#endif + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_unpcklpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpacklo_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ 
((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_pd (__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) +{ + return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_pd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_unpckhpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_unpackhi_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpackhi_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpackhi_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpackhi_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpckhps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_unpacklo_ps (__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtph_ps (__mmask8 
__U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_unpacklo_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_unpacklo_ps (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sra_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psrad256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psrad128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sra_epi64 (__m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sra_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sra_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psraq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sra_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sra_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) 
__builtin_ia32_psraq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pslld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_sll_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_sll_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_psllq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sll_epi32 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_pslld256_mask ((__v8si) __A, + (__v4si) __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_sll_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B) +{ + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_sll_epi64 (__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i) __builtin_ia32_psllq256_mask ((__v4di) __A, + (__v2di) __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_ps (__m256 __W, __mmask8 __U, __m256i __X, + __m256 __Y) +{ + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, + (__v8si) __X, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_ps (__mmask8 __U, __m256i __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_permvarsf256_mask ((__v8sf) __Y, + (__v8si) __X, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_pd (__m256i __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, + 
__m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_permvardf256_mask ((__v4df) __Y, + (__v4di) __X, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutevar_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, + (__v4di) __C, + (__v4df) __W, + (__mmask8) + __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutevar_pd (__mmask8 __U, __m256d __A, __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256_mask ((__v4df) __A, + (__v4di) __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutevar_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, + (__v8si) __C, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutevar_ps (__mmask8 __U, __m256 __A, __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256_mask ((__v8sf) __A, + (__v8si) __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutevar_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, + (__v2di) __C, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutevar_pd (__mmask8 __U, __m128d __A, __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd_mask ((__v2df) __A, + (__v2di) __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permutevar_ps (__m128 __W, __mmask8 __U, __m128 __A, + __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, + (__v4si) __C, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permutevar_ps (__mmask8 __U, __m128 __A, __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps_mask ((__v4sf) __A, + (__v4si) __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mullo_epi32 (__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_mask_mullo_epi32 (__m256i __W, __mmask8 __M, __m256i __A, + __m256i __B) +{ + return (__m256i) __builtin_ia32_pmulld256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mullo_epi32 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mullo_epi32 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i) __builtin_ia32_pmulld128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_epi32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuldq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_epi32 (__m128i __W, __mmask8 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_epi32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi64 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvardi256_mask ((__v4di) __Y, + (__v4di) __X, + (__v4di) __W, + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_mul_epu32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) __W, __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutexvar_epi32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_mul_epu32 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_pmuludq256_mask ((__v8si) __X, + (__v8si) __Y, + (__v4di) + _mm256_setzero_si256 (), + __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_mul_epu32 (__m128i __W, __mmask8 __M, __m128i __X, + 
__m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) __W, __M); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_mul_epu32 (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuludq128_mask ((__v4si) __X, + (__v4si) __Y, + (__v2di) + _mm_setzero_si128 (), + __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutexvar_epi32 (__m256i __X, __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutexvar_epi32 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i) __builtin_ia32_permvarsi256_mask ((__v8si) __Y, + (__v8si) __X, + (__v8si) __W, + __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epu64_mask (__m256i __X, 
__m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epu64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epu64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi32_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, 
+ (__v8si) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi32_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpneq_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpneq_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmplt_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmplt_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmpge_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmpge_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmple_epi64_mask (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmple_epi64_mask (__m256i __X, __m256i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_mask_cmpge_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epu64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epu64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) 
__builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi32_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi32_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, 2, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpneq_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 4, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmplt_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 1, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmpge_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) __M); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 5, + (__mmask8) -1); +} + +extern __inline __mmask8 + __attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmple_epi64_mask (__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) __M); +} + +extern __inline __mmask8 + 
__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_epi64_mask (__m128i __X, __m128i __Y) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, 2, + (__mmask8) -1); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex_epi64 (__m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) + _mm256_setzero_si256(), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex_epi64 (__m256i __W, __mmask8 __M, + __m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) __W, + (__mmask8) __M); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex_epi64 (__mmask8 __M, __m256i __X, const int __I) +{ + return (__m256i) __builtin_ia32_permdi256_mask ((__v4di) __X, + __I, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __M); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A, + (__v4df) __B, __imm, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_pd (__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_shufpd256_mask ((__v4df) __A, + (__v4df) __B, __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B, const int __imm) +{ + return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_pd (__mmask8 __U, __m128d __A, __m128d __B, + const int __imm) +{ + return (__m128d) __builtin_ia32_shufpd128_mask ((__v2df) __A, + (__v2df) __B, __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A, + (__v8sf) __B, __imm, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_ps (__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_shufps256_mask ((__v8sf) __A, + (__v8sf) __B, __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B, + const int __imm) +{ + return (__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_ps (__mmask8 __U, __m128 __A, __m128 __B, + const int __imm) +{ + return 
(__m128) __builtin_ia32_shufps128_mask ((__v4sf) __A, + (__v4sf) __B, __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_inserti32x4 (__m256i __A, __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_inserti32x4 (__m256i __W, __mmask8 __U, __m256i __A, + __m128i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) __W, + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_inserti32x4 (__mmask8 __U, __m256i __A, __m128i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si) __A, + (__v4si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf32x4 (__m256 __A, __m128 __B, const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_insertf32x4 (__m256 __W, __mmask8 __U, __m256 __A, + __m128 __B, const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_insertf32x4 (__mmask8 __U, __m256 __A, __m128 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf) __A, + (__v4sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extracti32x4_epi32 (__m256i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extracti32x4_epi32 (__m128i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) __W, + (__mmask8) + __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extracti32x4_epi32 (__mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si) __A, + __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf32x4_ps (__m256 __A, const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_extractf32x4_ps (__m128 __W, __mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + 
__imm, + (__v4sf) __W, + (__mmask8) + __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_extractf32x4_ps (__mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) + __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_i64x2 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_i64x2 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_i64x2 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di) __A, + (__v4di) __B, + __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_i32x4 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_i32x4 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_i32x4 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si) __A, + (__v8si) __B, + __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_f64x2 (__m256d __A, __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_f64x2 (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B, const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_f64x2 (__mmask8 __U, __m256d __A, __m256d __B, + const int __imm) +{ + return (__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df) __A, + (__v4df) __B, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_f32x4 (__m256 __A, __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + 
(__v8sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_f32x4 (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B, const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_f32x4 (__mmask8 __U, __m256 __A, __m256 __B, + const int __imm) +{ + return (__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf) __A, + (__v8sf) __B, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fixupimm_pd (__m256d __A, __m256d __B, __m256i __C, + const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fixupimm_pd (__m256d __A, __mmask8 __U, __m256d __B, + __m256i __C, const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fixupimm_pd (__mmask8 __U, __m256d __A, __m256d __B, + __m256i __C, const int __imm) +{ + return (__m256d) __builtin_ia32_fixupimmpd256_maskz ((__v4df) __A, + (__v4df) __B, + (__v4di) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fixupimm_ps (__m256 __A, __m256 __B, __m256i __C, + const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_fixupimm_ps (__m256 __A, __mmask8 __U, __m256 __B, + __m256i __C, const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_fixupimm_ps (__mmask8 __U, __m256 __A, __m256 __B, + __m256i __C, const int __imm) +{ + return (__m256) __builtin_ia32_fixupimmps256_maskz ((__v8sf) __A, + (__v8sf) __B, + (__v8si) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_pd (__m128d __A, __m128d __B, __m128i __C, + const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_pd (__m128d __A, __mmask8 __U, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_pd (__mmask8 __U, __m128d __A, __m128d __B, + __m128i __C, const int __imm) +{ + return (__m128d) __builtin_ia32_fixupimmpd128_maskz ((__v2df) __A, + 
(__v2df) __B, + (__v2di) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fixupimm_ps (__m128 __A, __m128 __B, __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_fixupimm_ps (__m128 __A, __mmask8 __U, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_fixupimm_ps (__mmask8 __U, __m128 __A, __m128 __B, + __m128i __C, const int __imm) +{ + return (__m128) __builtin_ia32_fixupimmps128_maskz ((__v4sf) __A, + (__v4sf) __B, + (__v4si) __C, + __imm, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi32 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrldi256_mask ((__v8si) __A, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi32 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrldi128_mask ((__v4si) __A, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srli_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srli_epi64 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psrlqi256_mask ((__v4di) __A, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srli_epi64 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psrlqi128_mask ((__v2di) __A, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ternarylogic_epi64 (__m256i __A, __m256i __B, __m256i __C, + const int __imm) +{ + return 
(__m256i) + __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ternarylogic_epi64 (__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) + __builtin_ia32_pternlogq256_mask ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ternarylogic_epi64 (__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) + __builtin_ia32_pternlogq256_maskz ((__v4di) __A, + (__v4di) __B, + (__v4di) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ternarylogic_epi32 (__m256i __A, __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) + __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ternarylogic_epi32 (__m256i __A, __mmask8 __U, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) + __builtin_ia32_pternlogd256_mask ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ternarylogic_epi32 (__mmask8 __U, __m256i __A, + __m256i __B, __m256i __C, + const int __imm) +{ + return (__m256i) + __builtin_ia32_pternlogd256_maskz ((__v8si) __A, + (__v8si) __B, + (__v8si) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ternarylogic_epi64 (__m128i __A, __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) + __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ternarylogic_epi64 (__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) + __builtin_ia32_pternlogq128_mask ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ternarylogic_epi64 (__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) + __builtin_ia32_pternlogq128_maskz ((__v2di) __A, + (__v2di) __B, + (__v2di) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ternarylogic_epi32 (__m128i __A, __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) + __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ternarylogic_epi32 (__m128i __A, __mmask8 __U, + __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) + __builtin_ia32_pternlogd128_mask ((__v4si) __A, + (__v4si) __B, + (__v4si) 
__C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ternarylogic_epi32 (__mmask8 __U, __m128i __A, + __m128i __B, __m128i __C, + const int __imm) +{ + return (__m128i) + __builtin_ia32_pternlogd128_maskz ((__v4si) __A, + (__v4si) __B, + (__v4si) __C, + (unsigned char) __imm, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_ps (__m256 __A, const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_ps (__m256 __W, __mmask8 __U, __m256 __A, + const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_ps (__mmask8 __U, __m256 __A, const int __imm) +{ + return (__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf) __A, + __imm, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_roundscale_pd (__m256d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_roundscale_pd (__m256d __W, __mmask8 __U, __m256d __A, + const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_roundscale_pd (__mmask8 __U, __m256d __A, const int __imm) +{ + return (__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df) __A, + __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_ps (__m128 __A, const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_ps (__m128 __W, __mmask8 __U, __m128 __A, + const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_ps (__mmask8 __U, __m128 __A, const int __imm) +{ + return (__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf) __A, + __imm, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roundscale_pd (__m128d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_roundscale_pd (__m128d __W, __mmask8 __U, __m128d __A, + const int __imm) +{ + return (__m128d) 
__builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_roundscale_pd (__mmask8 __U, __m128d __A, const int __imm) +{ + return (__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df) __A, + __imm, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_ps (__m256 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_ps (__m256 __W, __mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_ps (__mmask8 __U, __m256 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256) __builtin_ia32_getmantps256_mask ((__v8sf) __A, + (__C << 2) | __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getmant_ps (__m128 __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_ps (__m128 __W, __mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_ps (__mmask8 __U, __m128 __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128) __builtin_ia32_getmantps128_mask ((__v4sf) __A, + (__C << 2) | __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_getmant_pd (__m256d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_getmant_pd (__m256d __W, __mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_getmant_pd (__mmask8 __U, __m256d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m256d) __builtin_ia32_getmantpd256_mask ((__v4df) __A, + (__C << 2) | __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_getmant_pd (__m128d __A, _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_getmant_pd (__m128d __W, __mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_getmant_pd (__mmask8 __U, __m128d __A, + _MM_MANTISSA_NORM_ENUM __B, + _MM_MANTISSA_SIGN_ENUM __C) +{ + return (__m128d) __builtin_ia32_getmantpd128_mask ((__v2df) __A, + (__C << 2) | __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_ps (__m256 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256) __builtin_ia32_gather3siv8sf ((__v8sf) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_ps (__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3siv4sf ((__v4sf) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_pd (__m256d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m256d) __builtin_ia32_gather3siv4df ((__v4df) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_pd (__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128d) __builtin_ia32_gather3siv2df ((__v2df) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3div8sf ((__v4sf) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_ps (__m128 __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128) __builtin_ia32_gather3div4sf ((__v4sf) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_pd (__m256d __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256d) __builtin_ia32_gather3div4df ((__v4df) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_pd (__m128d __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128d) 
__builtin_ia32_gather3div2df ((__v2df) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_epi32 (__m256i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3siv8si ((__v8si) __v1_old, + __addr, + (__v8si) __index, + __mask, __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3siv4si ((__v4si) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i32gather_epi64 (__m256i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3siv4di ((__v4di) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i32gather_epi64 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3siv2di ((__v2di) __v1_old, + __addr, + (__v4si) __index, + __mask, __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div8si ((__v4si) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_epi32 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div4si ((__v4si) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mmask_i64gather_epi64 (__m256i __v1_old, __mmask8 __mask, + __m256i __index, void const *__addr, + int __scale) +{ + return (__m256i) __builtin_ia32_gather3div4di ((__v4di) __v1_old, + __addr, + (__v4di) __index, + __mask, __scale); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mmask_i64gather_epi64 (__m128i __v1_old, __mmask8 __mask, + __m128i __index, void const *__addr, + int __scale) +{ + return (__m128i) __builtin_ia32_gather3div2di ((__v2di) __v1_old, + __addr, + (__v2di) __index, + __mask, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_ps (void *__addr, __m256i __index, + __m256 __v1, const int __scale) +{ + __builtin_ia32_scattersiv8sf (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8sf) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_ps (void *__addr, __mmask8 __mask, + __m256i __index, __m256 __v1, + const int __scale) +{ + __builtin_ia32_scattersiv8sf (__addr, __mask, (__v8si) __index, + (__v8sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_ps (void *__addr, __m128i __index, 
__m128 __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4sf (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4sf) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_ps (void *__addr, __mmask8 __mask, + __m128i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4sf (__addr, __mask, (__v4si) __index, + (__v4sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_pd (void *__addr, __m128i __index, + __m256d __v1, const int __scale) +{ + __builtin_ia32_scattersiv4df (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4df) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_pd (void *__addr, __mmask8 __mask, + __m128i __index, __m256d __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4df (__addr, __mask, (__v4si) __index, + (__v4df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_pd (void *__addr, __m128i __index, + __m128d __v1, const int __scale) +{ + __builtin_ia32_scattersiv2df (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v2df) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_pd (void *__addr, __mmask8 __mask, + __m128i __index, __m128d __v1, + const int __scale) +{ + __builtin_ia32_scattersiv2df (__addr, __mask, (__v4si) __index, + (__v2df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_ps (void *__addr, __m256i __index, + __m128 __v1, const int __scale) +{ + __builtin_ia32_scatterdiv8sf (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4sf) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_ps (void *__addr, __mmask8 __mask, + __m256i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv8sf (__addr, __mask, (__v4di) __index, + (__v4sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64scatter_ps (void *__addr, __m128i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4sf (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v4sf) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_ps (void *__addr, __mmask8 __mask, + __m128i __index, __m128 __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4sf (__addr, __mask, (__v2di) __index, + (__v4sf) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_pd (void *__addr, __m256i __index, + __m256d __v1, const int __scale) +{ + __builtin_ia32_scatterdiv4df (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4df) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_pd (void *__addr, __mmask8 __mask, + __m256i __index, __m256d __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4df (__addr, __mask, (__v4di) __index, + (__v4df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64scatter_pd (void *__addr, __m128i 
__index, + __m128d __v1, const int __scale) +{ + __builtin_ia32_scatterdiv2df (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v2df) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_pd (void *__addr, __mmask8 __mask, + __m128i __index, __m128d __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv2df (__addr, __mask, (__v2di) __index, + (__v2df) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_epi32 (void *__addr, __m256i __index, + __m256i __v1, const int __scale) +{ + __builtin_ia32_scattersiv8si (__addr, (__mmask8) 0xFF, + (__v8si) __index, (__v8si) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask, + __m256i __index, __m256i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv8si (__addr, __mask, (__v8si) __index, + (__v8si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_epi32 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scattersiv4si (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4si) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_epi32 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4si (__addr, __mask, (__v4si) __index, + (__v4si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i32scatter_epi64 (void *__addr, __m128i __index, + __m256i __v1, const int __scale) +{ + __builtin_ia32_scattersiv4di (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v4di) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, + __m128i __index, __m256i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv4di (__addr, __mask, (__v4si) __index, + (__v4di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i32scatter_epi64 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scattersiv2di (__addr, (__mmask8) 0xFF, + (__v4si) __index, (__v2di) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i32scatter_epi64 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scattersiv2di (__addr, __mask, (__v4si) __index, + (__v2di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_epi32 (void *__addr, __m256i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv8si (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4si) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, + __m256i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv8si (__addr, __mask, (__v4di) __index, + (__v4si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_i64scatter_epi32 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv4si (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v4si) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_epi32 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4si (__addr, __mask, (__v2di) __index, + (__v4si) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_i64scatter_epi64 (void *__addr, __m256i __index, + __m256i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv4di (__addr, (__mmask8) 0xFF, + (__v4di) __index, (__v4di) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, + __m256i __index, __m256i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv4di (__addr, __mask, (__v4di) __index, + (__v4di) __v1, __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_i64scatter_epi64 (void *__addr, __m128i __index, + __m128i __v1, const int __scale) +{ + __builtin_ia32_scatterdiv2di (__addr, (__mmask8) 0xFF, + (__v2di) __index, (__v2di) __v1, + __scale); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_i64scatter_epi64 (void *__addr, __mmask8 __mask, + __m128i __index, __m128i __v1, + const int __scale) +{ + __builtin_ia32_scatterdiv2di (__addr, __mask, (__v2di) __index, + (__v2di) __v1, __scale); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_shuffle_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + _MM_PERM_ENUM __mask) +{ + return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_shuffle_epi32 (__mmask8 __U, __m256i __A, + _MM_PERM_ENUM __mask) +{ + return (__m256i) __builtin_ia32_pshufd256_mask ((__v8si) __A, __mask, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_shuffle_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + _MM_PERM_ENUM __mask) +{ + return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_shuffle_epi32 (__mmask8 __U, __m128i __A, + _MM_PERM_ENUM __mask) +{ + return (__m128i) __builtin_ia32_pshufd128_mask ((__v4si) __A, __mask, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rol_epi32 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rol_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm256_maskz_rol_epi32 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prold256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rol_epi32 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rol_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rol_epi32 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prold128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ror_epi32 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ror_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ror_epi32 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prord256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ror_epi32 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ror_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ror_epi32 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prord128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rol_epi64 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_rol_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_rol_epi64 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prolq256_mask ((__v4di) __A, 
__B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rol_epi64 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_rol_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_rol_epi64 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prolq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_ror_epi64 (__m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_ror_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __B) +{ + return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_ror_epi64 (__mmask8 __U, __m256i __A, const int __B) +{ + return (__m256i) __builtin_ia32_prorq256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ror_epi64 (__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_ror_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __B) +{ + return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_ror_epi64 (__mmask8 __U, __m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_prorq128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi32 (__m128i __A, __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, + (__v4si) __B, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_alignr_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, + (__v4si) __B, __imm, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_alignr_epi32 (__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) +{ + return (__m128i) __builtin_ia32_alignd128_mask ((__v4si) __A, + (__v4si) __B, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern 
__inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi64 (__m128i __A, __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, + (__v2di) __B, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_alignr_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B, const int __imm) +{ + return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, + (__v2di) __B, __imm, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_alignr_epi64 (__mmask8 __U, __m128i __A, __m128i __B, + const int __imm) +{ + return (__m128i) __builtin_ia32_alignq128_mask ((__v2di) __A, + (__v2di) __B, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi32 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, + (__v8si) __B, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_alignr_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, + (__v8si) __B, __imm, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_alignr_epi32 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_alignd256_mask ((__v8si) __A, + (__v8si) __B, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_alignr_epi64 (__m256i __A, __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, + (__v4di) __B, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_alignr_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + __m256i __B, const int __imm) +{ + return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, + (__v4di) __B, __imm, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_alignr_epi64 (__mmask8 __U, __m256i __A, __m256i __B, + const int __imm) +{ + return (__m256i) __builtin_ia32_alignq256_mask ((__v4di) __A, + (__v4di) __B, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m128 __A, + const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_cvtps_ph (__mmask8 __U, __m128 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf) __A, __I, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_mask_cvtps_ph (__m128i __W, __mmask8 __U, __m256 __A, + const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I, + (__v8hi) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_cvtps_ph (__mmask8 __U, __m256 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf) __A, __I, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srai_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srai_epi32 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psradi256_mask ((__v8si) __A, __imm, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srai_epi32 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srai_epi32 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psradi128_mask ((__v4si) __A, __imm, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_srai_epi64 (__m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_srai_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + const int __imm) +{ + return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_srai_epi64 (__mmask8 __U, __m256i __A, const int __imm) +{ + return (__m256i) __builtin_ia32_psraqi256_mask ((__v4di) __A, __imm, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi64 (__m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_srai_epi64 (__m128i __W, __mmask8 __U, __m128i __A, + const int __imm) +{ + return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_srai_epi64 (__mmask8 __U, __m128i __A, const int __imm) +{ + return (__m128i) __builtin_ia32_psraqi128_mask ((__v2di) __A, __imm, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_slli_epi32 (__m128i __W, __mmask8 __U, __m128i __A, int __B) 
+{ + return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B, + (__v4si) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_slli_epi32 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_pslldi128_mask ((__v4si) __A, __B, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_slli_epi64 (__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_slli_epi64 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i) __builtin_ia32_psllqi128_mask ((__v2di) __A, __B, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_slli_epi32 (__m256i __W, __mmask8 __U, __m256i __A, + int __B) +{ + return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_slli_epi32 (__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i) __builtin_ia32_pslldi256_mask ((__v8si) __A, __B, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_slli_epi64 (__m256i __W, __mmask8 __U, __m256i __A, + int __B) +{ + return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_slli_epi64 (__mmask8 __U, __m256i __A, int __B) +{ + return (__m256i) __builtin_ia32_psllqi256_mask ((__v4di) __A, __B, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permutex_pd (__m256d __W, __mmask8 __U, __m256d __X, + const int __imm) +{ + return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permutex_pd (__mmask8 __U, __m256d __X, const int __imm) +{ + return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __imm, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permute_pd (__m256d __W, __mmask8 __U, __m256d __X, + const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permute_pd (__mmask8 __U, __m256d __X, const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df) __X, __C, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permute_pd (__m128d __W, __mmask8 __U, __m128d __X, + const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C, + (__v2df) __W, + (__mmask8) 
__U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permute_pd (__mmask8 __U, __m128d __X, const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd_mask ((__v2df) __X, __C, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_permute_ps (__m256 __W, __mmask8 __U, __m256 __X, + const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_permute_ps (__mmask8 __U, __m256 __X, const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256_mask ((__v8sf) __X, __C, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_permute_ps (__m128 __W, __mmask8 __U, __m128 __X, + const int __C) +{ + return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_permute_ps (__mmask8 __U, __m128 __X, const int __C) +{ + return (__m128) __builtin_ia32_vpermilps_mask ((__v4sf) __X, __C, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) +{ + return (__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) +{ + return (__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +extern __inline __m128d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) +{ + return (__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) +{ + return (__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) __A, + (__v4si) __W, + 
(__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi64_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epi32_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu64_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_epu32_mask (__m256i __X, __m256i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_pd_mask (__m256d __X, __m256d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X, + (__v4df) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ps_mask (__m256 __X, __m256 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X, + (__v8sf) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi64_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq256_mask ((__v4di) __X, + (__v4di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epi32_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd256_mask ((__v8si) __X, + (__v8si) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu64_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di) __X, + (__v4di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_epu32_mask (__mmask8 __U, __m256i __X, __m256i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si) __X, + (__v8si) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_pd_mask (__mmask8 __U, __m256d __X, __m256d __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd256_mask ((__v4df) __X, + (__v4df) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_cmp_ps_mask (__mmask8 __U, __m256 __X, __m256 __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf) __X, + (__v8sf) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi64_mask (__m128i __X, __m128i __Y, const int __P) +{ + return 
(__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epi32_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu64_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_epu32_mask (__m128i __X, __m128i __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_pd_mask (__m128d __X, __m128d __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ps_mask (__m128 __X, __m128 __Y, const int __P) +{ + return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) -1); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi64_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epi32_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu64_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di) __X, + (__v2di) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_epu32_mask (__mmask8 __U, __m128i __X, __m128i __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si) __X, + (__v4si) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_pd_mask (__mmask8 __U, __m128d __X, __m128d __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmppd128_mask ((__v2df) __X, + (__v2df) __Y, __P, + (__mmask8) __U); +} + +extern __inline __mmask8 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_cmp_ps_mask (__mmask8 __U, __m128 __X, __m128 __Y, + const int __P) +{ + return (__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf) __X, + (__v4sf) __Y, __P, + (__mmask8) __U); +} + +extern __inline __m256d +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutex_pd (__m256d __X, const int __M) +{ + return (__m256d) __builtin_ia32_permdf256_mask ((__v4df) __X, __M, + (__v4df) + _mm256_undefined_pd (), + (__mmask8) -1); +} + +#else +#define _mm256_permutex_pd(X, M) \ + ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(X), (int)(M), \ 
+ (__v4df)(__m256d) \ + _mm256_undefined_pd (), \ + (__mmask8)-1)) + +#define _mm256_permutex_epi64(X, I) \ + ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \ + (int)(I), \ + (__v4di)(__m256i) \ + (_mm256_setzero_si256 ()),\ + (__mmask8) -1)) + +#define _mm256_maskz_permutex_epi64(M, X, I) \ + ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \ + (int)(I), \ + (__v4di)(__m256i) \ + (_mm256_setzero_si256 ()),\ + (__mmask8)(M))) + +#define _mm256_mask_permutex_epi64(W, M, X, I) \ + ((__m256i) __builtin_ia32_permdi256_mask ((__v4di)(__m256i)(X), \ + (int)(I), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(M))) + +#define _mm256_insertf32x4(X, Y, C) \ + ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \ + (__v4sf)(__m128) (Y), (int) (C), \ + (__v8sf)(__m256)_mm256_setzero_ps (), \ + (__mmask8)-1)) + +#define _mm256_mask_insertf32x4(W, U, X, Y, C) \ + ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \ + (__v4sf)(__m128) (Y), (int) (C), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_insertf32x4(U, X, Y, C) \ + ((__m256) __builtin_ia32_insertf32x4_256_mask ((__v8sf)(__m256) (X), \ + (__v4sf)(__m128) (Y), (int) (C), \ + (__v8sf)(__m256)_mm256_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm256_inserti32x4(X, Y, C) \ + ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\ + (__v4si)(__m128i) (Y), (int) (C), \ + (__v8si)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)-1)) + +#define _mm256_mask_inserti32x4(W, U, X, Y, C) \ + ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\ + (__v4si)(__m128i) (Y), (int) (C), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_inserti32x4(U, X, Y, C) \ + ((__m256i) __builtin_ia32_inserti32x4_256_mask ((__v8si)(__m256i) (X),\ + (__v4si)(__m128i) (Y), (int) (C), \ + (__v8si)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm256_extractf32x4_ps(X, C) \ + ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \ + (int) (C), \ + (__v4sf)(__m128)_mm_setzero_ps (), \ + (__mmask8)-1)) + +#define _mm256_mask_extractf32x4_ps(W, U, X, C) \ + ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \ + (int) (C), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_extractf32x4_ps(U, X, C) \ + ((__m128) __builtin_ia32_extractf32x4_256_mask ((__v8sf)(__m256) (X), \ + (int) (C), \ + (__v4sf)(__m128)_mm_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm256_extracti32x4_epi32(X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\ + (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)-1)) + +#define _mm256_mask_extracti32x4_epi32(W, U, X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\ + (int) (C), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm256_maskz_extracti32x4_epi32(U, X, C) \ + ((__m128i) __builtin_ia32_extracti32x4_256_mask ((__v8si)(__m256i) (X),\ + (int) (C), (__v4si)(__m128i)_mm_setzero_si128 (), (__mmask8)(U))) + +#define _mm256_shuffle_i64x2(X, Y, C) \ + ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(C), \ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)-1)) + +#define _mm256_mask_shuffle_i64x2(W, U, X, Y, C) \ + ((__m256i) __builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(C), \ + (__v4di)(__m256i)(W),\ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_i64x2(U, X, Y, C) \ + ((__m256i) 
__builtin_ia32_shuf_i64x2_256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(C), \ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm256_shuffle_i32x4(X, Y, C) \ + ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i) \ + _mm256_setzero_si256 (), \ + (__mmask8)-1)) + +#define _mm256_mask_shuffle_i32x4(W, U, X, Y, C) \ + ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_i32x4(U, X, Y, C) \ + ((__m256i) __builtin_ia32_shuf_i32x4_256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C), \ + (__v8si)(__m256i) \ + _mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm256_shuffle_f64x2(X, Y, C) \ + ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd (),\ + (__mmask8)-1)) + +#define _mm256_mask_shuffle_f64x2(W, U, X, Y, C) \ + ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(C), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_f64x2(U, X, Y, C) \ + ((__m256d) __builtin_ia32_shuf_f64x2_256_mask ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd( ),\ + (__mmask8)(U))) + +#define _mm256_shuffle_f32x4(X, Y, C) \ + ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(C), \ + (__v8sf)(__m256)_mm256_setzero_ps (), \ + (__mmask8)-1)) + +#define _mm256_mask_shuffle_f32x4(W, U, X, Y, C) \ + ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(C), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_f32x4(U, X, Y, C) \ + ((__m256) __builtin_ia32_shuf_f32x4_256_mask ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(C), \ + (__v8sf)(__m256)_mm256_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm256_mask_shuffle_pd(W, U, A, B, C) \ + ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_pd(U, A, B, C) \ + ((__m256d)__builtin_ia32_shufpd256_mask ((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d) \ + _mm256_setzero_pd (), \ + (__mmask8)(U))) + +#define _mm_mask_shuffle_pd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_shuffle_pd(U, A, B, C) \ + ((__m128d)__builtin_ia32_shufpd128_mask ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)_mm_setzero_pd (), \ + (__mmask8)(U))) + +#define _mm256_mask_shuffle_ps(W, U, A, B, C) \ + ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_ps(U, A, B, C) \ + ((__m256) __builtin_ia32_shufps256_mask ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)_mm256_setzero_ps (),\ + (__mmask8)(U))) + +#define _mm_mask_shuffle_ps(W, U, A, B, C) \ + ((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_shuffle_ps(U, A, B, C) \ + 
((__m128) __builtin_ia32_shufps128_mask ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)_mm_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm256_fixupimm_pd(X, Y, Z, C) \ + ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256i)(Z), (int)(C), \ + (__mmask8)(-1))) + +#define _mm256_mask_fixupimm_pd(X, U, Y, Z, C) \ + ((__m256d)__builtin_ia32_fixupimmpd256_mask ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256i)(Z), (int)(C), \ + (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_pd(U, X, Y, Z, C) \ + ((__m256d)__builtin_ia32_fixupimmpd256_maskz ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256i)(Z), (int)(C),\ + (__mmask8)(U))) + +#define _mm256_fixupimm_ps(X, Y, Z, C) \ + ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256i)(Z), (int)(C), \ + (__mmask8)(-1))) + + +#define _mm256_mask_fixupimm_ps(X, U, Y, Z, C) \ + ((__m256)__builtin_ia32_fixupimmps256_mask ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256i)(Z), (int)(C), \ + (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_ps(U, X, Y, Z, C) \ + ((__m256)__builtin_ia32_fixupimmps256_maskz ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256i)(Z), (int)(C),\ + (__mmask8)(U))) + +#define _mm_fixupimm_pd(X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1))) + + +#define _mm_mask_fixupimm_pd(X, U, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmpd128_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128i)(Z), (int)(C), \ + (__mmask8)(U))) + +#define _mm_maskz_fixupimm_pd(U, X, Y, Z, C) \ + ((__m128d)__builtin_ia32_fixupimmpd128_maskz ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128i)(Z), (int)(C),\ + (__mmask8)(U))) + +#define _mm_fixupimm_ps(X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (__v4si)(__m128i)(Z), (int)(C), \ + (__mmask8)(-1))) + +#define _mm_mask_fixupimm_ps(X, U, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmps128_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (__v4si)(__m128i)(Z), (int)(C),\ + (__mmask8)(U))) + +#define _mm_maskz_fixupimm_ps(U, X, Y, Z, C) \ + ((__m128)__builtin_ia32_fixupimmps128_maskz ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (__v4si)(__m128i)(Z), (int)(C),\ + (__mmask8)(U))) + +#define _mm256_mask_srli_epi32(W, U, A, B) \ + ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \ + (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_srli_epi32(U, A, B) \ + ((__m256i) __builtin_ia32_psrldi256_mask ((__v8si)(__m256i)(A), \ + (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U))) + +#define _mm_mask_srli_epi32(W, U, A, B) \ + ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \ + (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srli_epi32(U, A, B) \ + ((__m128i) __builtin_ia32_psrldi128_mask ((__v4si)(__m128i)(A), \ + (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U))) + +#define _mm256_mask_srli_epi64(W, U, A, B) \ + ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \ + (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_srli_epi64(U, A, B) \ + ((__m256i) __builtin_ia32_psrlqi256_mask ((__v4di)(__m256i)(A), \ + (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)(U))) + 
+#define _mm_mask_srli_epi64(W, U, A, B) \ + ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \ + (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srli_epi64(U, A, B) \ + ((__m128i) __builtin_ia32_psrlqi128_mask ((__v2di)(__m128i)(A), \ + (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U))) + +#define _mm256_mask_slli_epi32(W, U, X, C) \ + ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_slli_epi32(U, X, C) \ + ((__m256i)__builtin_ia32_pslldi256_mask ((__v8si)(__m256i)(X), (int)(C),\ + (__v8si)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm256_mask_slli_epi64(W, U, X, C) \ + ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\ + (__v4di)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_slli_epi64(U, X, C) \ + ((__m256i)__builtin_ia32_psllqi256_mask ((__v4di)(__m256i)(X), (int)(C),\ + (__v4di)(__m256i)_mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm_mask_slli_epi32(W, U, X, C) \ + ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\ + (__v4si)(__m128i)(W),\ + (__mmask8)(U))) + +#define _mm_maskz_slli_epi32(U, X, C) \ + ((__m128i)__builtin_ia32_pslldi128_mask ((__v4si)(__m128i)(X), (int)(C),\ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm_mask_slli_epi64(W, U, X, C) \ + ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\ + (__v2di)(__m128i)(W),\ + (__mmask8)(U))) + +#define _mm_maskz_slli_epi64(U, X, C) \ + ((__m128i)__builtin_ia32_psllqi128_mask ((__v2di)(__m128i)(X), (int)(C),\ + (__v2di)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm256_ternarylogic_epi64(A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) (B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm256_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_mask ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) (B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogq256_maskz ((__v4di) (__m256i) (A), \ + (__v4di) (__m256i) (B), \ + (__v4di) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_ternarylogic_epi32(A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm256_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_mask ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m256i) \ + __builtin_ia32_pternlogd256_maskz ((__v8si) (__m256i) (A), \ + (__v8si) (__m256i) (B), \ + (__v8si) (__m256i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_ternarylogic_epi64(A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm_mask_ternarylogic_epi64(A, U, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_mask ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned 
char) (I), \ + (__mmask8) (U))) + +#define _mm_maskz_ternarylogic_epi64(U, A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogq128_maskz ((__v2di) (__m128i) (A), \ + (__v2di) (__m128i) (B), \ + (__v2di) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_ternarylogic_epi32(A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) -1)) + +#define _mm_mask_ternarylogic_epi32(A, U, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_mask ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, I) \ + ((__m128i) \ + __builtin_ia32_pternlogd128_maskz ((__v4si) (__m128i) (A), \ + (__v4si) (__m128i) (B), \ + (__v4si) (__m128i) (C), \ + (unsigned char) (I), \ + (__mmask8) (U))) + +#define _mm256_roundscale_ps(A, B) \ + ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \ + (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)-1)) + +#define _mm256_mask_roundscale_ps(W, U, A, B) \ + ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \ + (int)(B), (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_roundscale_ps(U, A, B) \ + ((__m256) __builtin_ia32_rndscaleps_256_mask ((__v8sf)(__m256)(A), \ + (int)(B), (__v8sf)(__m256)_mm256_setzero_ps (), (__mmask8)(U))) + +#define _mm256_roundscale_pd(A, B) \ + ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)-1)) + +#define _mm256_mask_roundscale_pd(W, U, A, B) \ + ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_roundscale_pd(U, A, B) \ + ((__m256d) __builtin_ia32_rndscalepd_256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) + +#define _mm_roundscale_ps(A, B) \ + ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \ + (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)-1)) + +#define _mm_mask_roundscale_ps(W, U, A, B) \ + ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \ + (int)(B), (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_roundscale_ps(U, A, B) \ + ((__m128) __builtin_ia32_rndscaleps_128_mask ((__v4sf)(__m128)(A), \ + (int)(B), (__v4sf)(__m128)_mm_setzero_ps (), (__mmask8)(U))) + +#define _mm_roundscale_pd(A, B) \ + ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \ + (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)-1)) + +#define _mm_mask_roundscale_pd(W, U, A, B) \ + ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \ + (int)(B), (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_roundscale_pd(U, A, B) \ + ((__m128d) __builtin_ia32_rndscalepd_128_mask ((__v2df)(__m128d)(A), \ + (int)(B), (__v2df)(__m128d)_mm_setzero_pd (), (__mmask8)(U))) + +#define _mm256_getmant_ps(X, B, C) \ + ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)(__m256)_mm256_setzero_ps (), \ + (__mmask8)-1)) + +#define _mm256_mask_getmant_ps(W, U, X, B, C) \ + ((__m256) __builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_getmant_ps(U, X, B, C) \ + ((__m256) 
__builtin_ia32_getmantps256_mask ((__v8sf)(__m256) (X), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)(__m256)_mm256_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm_getmant_ps(X, B, C) \ + ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)(__m128)_mm_setzero_ps (), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ps(W, U, X, B, C) \ + ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ps(U, X, B, C) \ + ((__m128) __builtin_ia32_getmantps128_mask ((__v4sf)(__m128) (X), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)(__m128)_mm_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm256_getmant_pd(X, B, C) \ + ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \ + (int)(((C)<<2) | (B)), \ + (__v4df)(__m256d)_mm256_setzero_pd (),\ + (__mmask8)-1)) + +#define _mm256_mask_getmant_pd(W, U, X, B, C) \ + ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \ + (int)(((C)<<2) | (B)), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_getmant_pd(U, X, B, C) \ + ((__m256d) __builtin_ia32_getmantpd256_mask ((__v4df)(__m256d) (X), \ + (int)(((C)<<2) | (B)), \ + (__v4df)(__m256d)_mm256_setzero_pd (),\ + (__mmask8)(U))) + +#define _mm_getmant_pd(X, B, C) \ + ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \ + (int)(((C)<<2) | (B)), \ + (__v2df)(__m128d)_mm_setzero_pd (), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_pd(W, U, X, B, C) \ + ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \ + (int)(((C)<<2) | (B)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_pd(U, X, B, C) \ + ((__m128d) __builtin_ia32_getmantpd128_mask ((__v2df)(__m128d) (X), \ + (int)(((C)<<2) | (B)), \ + (__v2df)(__m128d)_mm_setzero_pd (), \ + (__mmask8)(U))) + +#define _mm256_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256) __builtin_ia32_gather3siv8sf ((__v8sf)(__m256) (V1OLD), \ + (void const *) (ADDR), \ + (__v8si)(__m256i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i32gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128) __builtin_ia32_gather3siv4sf ((__v4sf)(__m128) (V1OLD), \ + (void const *) (ADDR), \ + (__v4si)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256d) __builtin_ia32_gather3siv4df ((__v4df)(__m256d) (V1OLD), \ + (void const *) (ADDR), \ + (__v4si)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i32gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128d) __builtin_ia32_gather3siv2df ((__v2df)(__m128d) (V1OLD), \ + (void const *) (ADDR), \ + (__v4si)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128) __builtin_ia32_gather3div8sf ((__v4sf)(__m128) (V1OLD), \ + (void const *) (ADDR), \ + (__v4di)(__m256i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i64gather_ps(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128) __builtin_ia32_gather3div4sf ((__v4sf)(__m128) (V1OLD), \ + (void const *) (ADDR), \ + (__v2di)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256d) __builtin_ia32_gather3div4df ((__v4df)(__m256d) (V1OLD), \ + (void const *) (ADDR), \ + (__v4di)(__m256i) (INDEX), \ + (__mmask8) 
(MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i64gather_pd(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128d) __builtin_ia32_gather3div2df ((__v2df)(__m128d) (V1OLD), \ + (void const *) (ADDR), \ + (__v2di)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gather3siv8si ((__v8si)(__m256i) (V1OLD), \ + (void const *) (ADDR), \ + (__v8si)(__m256i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i32gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3siv4si ((__v4si)(__m128i) (V1OLD), \ + (void const *) (ADDR), \ + (__v4si)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gather3siv4di ((__v4di)(__m256i) (V1OLD), \ + (void const *) (ADDR), \ + (__v4si)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i32gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3siv2di ((__v2di)(__m128i) (V1OLD), \ + (void const *) (ADDR), \ + (__v4si)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3div8si ((__v4si)(__m128i) (V1OLD), \ + (void const *) (ADDR), \ + (__v4di)(__m256i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i64gather_epi32(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3div4si ((__v4si)(__m128i) (V1OLD), \ + (void const *) (ADDR), \ + (__v2di)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m256i) __builtin_ia32_gather3div4di ((__v4di)(__m256i) (V1OLD), \ + (void const *) (ADDR), \ + (__v4di)(__m256i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm_mmask_i64gather_epi64(V1OLD, MASK, INDEX, ADDR, SCALE) \ + (__m128i) __builtin_ia32_gather3div2di ((__v2di)(__m128i) (V1OLD), \ + (void const *) (ADDR), \ + (__v2di)(__m128i) (INDEX), \ + (__mmask8) (MASK), \ + (int) (SCALE)) + +#define _mm256_i32scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8si)(__m256i) (INDEX), \ + (__v8sf)(__m256) (V1), (int) (SCALE)) + +#define _mm256_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8sf ((void *) (ADDR), (__mmask8) (MASK), \ + (__v8si)(__m256i) (INDEX), \ + (__v8sf)(__m256) (V1), (int) (SCALE)) + +#define _mm_i32scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4si)(__m128i) (INDEX), \ + (__v4sf)(__m128) (V1), (int) (SCALE)) + +#define _mm_mask_i32scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4sf ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4si)(__m128i) (INDEX), \ + (__v4sf)(__m128) (V1), (int) (SCALE)) + +#define _mm256_i32scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4si)(__m128i) (INDEX), \ + (__v4df)(__m256d) (V1), (int) (SCALE)) + +#define _mm256_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4df ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4si)(__m128i) (INDEX), \ + (__v4df)(__m256d) (V1), (int) (SCALE)) + +#define _mm_i32scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8)0xFF, \ + 
(__v4si)(__m128i) (INDEX), \ + (__v2df)(__m128d) (V1), (int) (SCALE)) + +#define _mm_mask_i32scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2df ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4si)(__m128i) (INDEX), \ + (__v2df)(__m128d) (V1), (int) (SCALE)) + +#define _mm256_i64scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4di)(__m256i) (INDEX), \ + (__v4sf)(__m128) (V1), (int) (SCALE)) + +#define _mm256_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8sf ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4di)(__m256i) (INDEX), \ + (__v4sf)(__m128) (V1), (int) (SCALE)) + +#define _mm_i64scatter_ps(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8)0xFF, \ + (__v2di)(__m128i) (INDEX), \ + (__v4sf)(__m128) (V1), (int) (SCALE)) + +#define _mm_mask_i64scatter_ps(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4sf ((void *) (ADDR), (__mmask8) (MASK), \ + (__v2di)(__m128i) (INDEX), \ + (__v4sf)(__m128) (V1), (int) (SCALE)) + +#define _mm256_i64scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4di)(__m256i) (INDEX), \ + (__v4df)(__m256d) (V1), (int) (SCALE)) + +#define _mm256_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4df ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4di)(__m256i) (INDEX), \ + (__v4df)(__m256d) (V1), (int) (SCALE)) + +#define _mm_i64scatter_pd(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8)0xFF, \ + (__v2di)(__m128i) (INDEX), \ + (__v2df)(__m128d) (V1), (int) (SCALE)) + +#define _mm_mask_i64scatter_pd(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2df ((void *) (ADDR), (__mmask8) (MASK), \ + (__v2di)(__m128i) (INDEX), \ + (__v2df)(__m128d) (V1), (int) (SCALE)) + +#define _mm256_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8)0xFF, \ + (__v8si)(__m256i) (INDEX), \ + (__v8si)(__m256i) (V1), (int) (SCALE)) + +#define _mm256_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv8si ((void *) (ADDR), (__mmask8) (MASK), \ + (__v8si)(__m256i) (INDEX), \ + (__v8si)(__m256i) (V1), (int) (SCALE)) + +#define _mm_i32scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4si)(__m128i) (INDEX), \ + (__v4si)(__m128i) (V1), (int) (SCALE)) + +#define _mm_mask_i32scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4si ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4si)(__m128i) (INDEX), \ + (__v4si)(__m128i) (V1), (int) (SCALE)) + +#define _mm256_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4si)(__m128i) (INDEX), \ + (__v4di)(__m256i) (V1), (int) (SCALE)) + +#define _mm256_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv4di ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4si)(__m128i) (INDEX), \ + (__v4di)(__m256i) (V1), (int) (SCALE)) + +#define _mm_i32scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4si)(__m128i) (INDEX), \ + (__v2di)(__m128i) (V1), (int) (SCALE)) + +#define _mm_mask_i32scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scattersiv2di ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4si)(__m128i) (INDEX), \ + (__v2di)(__m128i) (V1), (int) (SCALE)) + +#define 
_mm256_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4di)(__m256i) (INDEX), \ + (__v4si)(__m128i) (V1), (int) (SCALE)) + +#define _mm256_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv8si ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4di)(__m256i) (INDEX), \ + (__v4si)(__m128i) (V1), (int) (SCALE)) + +#define _mm_i64scatter_epi32(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8)0xFF, \ + (__v2di)(__m128i) (INDEX), \ + (__v4si)(__m128i) (V1), (int) (SCALE)) + +#define _mm_mask_i64scatter_epi32(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4si ((void *) (ADDR), (__mmask8) (MASK), \ + (__v2di)(__m128i) (INDEX), \ + (__v4si)(__m128i) (V1), (int) (SCALE)) + +#define _mm256_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8)0xFF, \ + (__v4di)(__m256i) (INDEX), \ + (__v4di)(__m256i) (V1), (int) (SCALE)) + +#define _mm256_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv4di ((void *) (ADDR), (__mmask8) (MASK), \ + (__v4di)(__m256i) (INDEX), \ + (__v4di)(__m256i) (V1), (int) (SCALE)) + +#define _mm_i64scatter_epi64(ADDR, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8)0xFF, \ + (__v2di)(__m128i) (INDEX), \ + (__v2di)(__m128i) (V1), (int) (SCALE)) + +#define _mm_mask_i64scatter_epi64(ADDR, MASK, INDEX, V1, SCALE) \ + __builtin_ia32_scatterdiv2di ((void *) (ADDR), (__mmask8) (MASK), \ + (__v2di)(__m128i) (INDEX), \ + (__v2di)(__m128i) (V1), (int) (SCALE)) + +#define _mm256_mask_shuffle_epi32(W, U, X, C) \ + ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_shuffle_epi32(U, X, C) \ + ((__m256i) __builtin_ia32_pshufd256_mask ((__v8si)(__m256i)(X), (int)(C), \ + (__v8si)(__m256i) \ + _mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm_mask_shuffle_epi32(W, U, X, C) \ + ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_shuffle_epi32(U, X, C) \ + ((__m128i) __builtin_ia32_pshufd128_mask ((__v4si)(__m128i)(X), (int)(C), \ + (__v4si)(__m128i)_mm_setzero_si128 (), \ + (__mmask8)(U))) + +#define _mm256_rol_epi64(A, B) \ + ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)-1)) + +#define _mm256_mask_rol_epi64(W, U, A, B) \ + ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_rol_epi64(U, A, B) \ + ((__m256i)__builtin_ia32_prolq256_mask ((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)(U))) + +#define _mm_rol_epi64(A, B) \ + ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)-1)) + +#define _mm_mask_rol_epi64(W, U, A, B) \ + ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_rol_epi64(U, A, B) \ + ((__m128i)__builtin_ia32_prolq128_mask ((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm256_ror_epi64(A, B) \ + ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)_mm256_setzero_si256 (),\ + 
(__mmask8)-1)) + +#define _mm256_mask_ror_epi64(W, U, A, B) \ + ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_ror_epi64(U, A, B) \ + ((__m256i)__builtin_ia32_prorq256_mask ((__v4di)(__m256i)(A), (int)(B), \ + (__v4di)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)(U))) + +#define _mm_ror_epi64(A, B) \ + ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)-1)) + +#define _mm_mask_ror_epi64(W, U, A, B) \ + ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_ror_epi64(U, A, B) \ + ((__m128i)__builtin_ia32_prorq128_mask ((__v2di)(__m128i)(A), (int)(B), \ + (__v2di)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm256_rol_epi32(A, B) \ + ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)-1)) + +#define _mm256_mask_rol_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_rol_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_prold256_mask ((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)(U))) + +#define _mm_rol_epi32(A, B) \ + ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)-1)) + +#define _mm_mask_rol_epi32(W, U, A, B) \ + ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_rol_epi32(U, A, B) \ + ((__m128i)__builtin_ia32_prold128_mask ((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm256_ror_epi32(A, B) \ + ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)-1)) + +#define _mm256_mask_ror_epi32(W, U, A, B) \ + ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_ror_epi32(U, A, B) \ + ((__m256i)__builtin_ia32_prord256_mask ((__v8si)(__m256i)(A), (int)(B), \ + (__v8si)(__m256i) \ + _mm256_setzero_si256 (), \ + (__mmask8)(U))) + +#define _mm_ror_epi32(A, B) \ + ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)-1)) + +#define _mm_mask_ror_epi32(W, U, A, B) \ + ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_ror_epi32(U, A, B) \ + ((__m128i)__builtin_ia32_prord128_mask ((__v4si)(__m128i)(A), (int)(B), \ + (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm256_alignr_epi32(X, Y, C) \ + ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(X), (__mmask8)-1)) + +#define _mm256_mask_alignr_epi32(W, U, X, Y, C) \ + ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_alignr_epi32(U, X, Y, C) \ + ((__m256i)__builtin_ia32_alignd256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(C), (__v8si)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)(U))) + +#define 
_mm256_alignr_epi64(X, Y, C) \ + ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(X), (__mmask8)-1)) + +#define _mm256_mask_alignr_epi64(W, U, X, Y, C) \ + ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_alignr_epi64(U, X, Y, C) \ + ((__m256i)__builtin_ia32_alignq256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(C), (__v4di)(__m256i)_mm256_setzero_si256 (),\ + (__mmask8)(U))) + +#define _mm_alignr_epi32(X, Y, C) \ + ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(X), (__mmask8)-1)) + +#define _mm_mask_alignr_epi32(W, U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_alignr_epi32(U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignd128_mask ((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(C), (__v4si)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm_alignr_epi64(X, Y, C) \ + ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1)) + +#define _mm_mask_alignr_epi64(W, U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)(X), (__mmask8)-1)) + +#define _mm_maskz_alignr_epi64(U, X, Y, C) \ + ((__m128i)__builtin_ia32_alignq128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(C), (__v2di)(__m128i)_mm_setzero_si128 (),\ + (__mmask8)(U))) + +#define _mm_mask_cvtps_ph(W, U, A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), \ + (__v8hi)(__m128i) (W), (__mmask8) (U))) + +#define _mm_maskz_cvtps_ph(U, A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph_mask ((__v4sf)(__m128) (A), (int) (I), \ + (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) + +#define _mm256_mask_cvtps_ph(W, U, A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), \ + (__v8hi)(__m128i) (W), (__mmask8) (U))) + +#define _mm256_maskz_cvtps_ph(U, A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph256_mask ((__v8sf)(__m256) (A), (int) (I), \ + (__v8hi)(__m128i) _mm_setzero_si128 (), (__mmask8) (U))) + +#define _mm256_mask_srai_epi32(W, U, A, B) \ + ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \ + (int)(B), (__v8si)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_srai_epi32(U, A, B) \ + ((__m256i) __builtin_ia32_psradi256_mask ((__v8si)(__m256i)(A), \ + (int)(B), (__v8si)_mm256_setzero_si256 (), (__mmask8)(U))) + +#define _mm_mask_srai_epi32(W, U, A, B) \ + ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \ + (int)(B), (__v4si)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srai_epi32(U, A, B) \ + ((__m128i) __builtin_ia32_psradi128_mask ((__v4si)(__m128i)(A), \ + (int)(B), (__v4si)_mm_setzero_si128 (), (__mmask8)(U))) + +#define _mm256_srai_epi64(A, B) \ + ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \ + (int)(B), (__v4di)_mm256_setzero_si256 (), (__mmask8)-1)) + +#define _mm256_mask_srai_epi64(W, U, A, B) \ + ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \ + (int)(B), (__v4di)(__m256i)(W), (__mmask8)(U))) + +#define _mm256_maskz_srai_epi64(U, A, B) \ + ((__m256i) __builtin_ia32_psraqi256_mask ((__v4di)(__m256i)(A), \ + (int)(B), 
(__v4di)_mm256_setzero_si256 (), (__mmask8)(U))) + +#define _mm_srai_epi64(A, B) \ + ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \ + (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)-1)) + +#define _mm_mask_srai_epi64(W, U, A, B) \ + ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \ + (int)(B), (__v2di)(__m128i)(W), (__mmask8)(U))) + +#define _mm_maskz_srai_epi64(U, A, B) \ + ((__m128i) __builtin_ia32_psraqi128_mask ((__v2di)(__m128i)(A), \ + (int)(B), (__v2di)_mm_setzero_si128 (), (__mmask8)(U))) + +#define _mm256_mask_permutex_pd(W, U, A, B) \ + ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)(__m256d)(W), (__mmask8)(U))) + +#define _mm256_maskz_permutex_pd(U, A, B) \ + ((__m256d) __builtin_ia32_permdf256_mask ((__v4df)(__m256d)(A), \ + (int)(B), (__v4df)(__m256d)_mm256_setzero_pd (), (__mmask8)(U))) + +#define _mm256_mask_permute_pd(W, U, X, C) \ + ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_permute_pd(U, X, C) \ + ((__m256d) __builtin_ia32_vpermilpd256_mask ((__v4df)(__m256d)(X), (int)(C), \ + (__v4df)(__m256d)_mm256_setzero_pd (),\ + (__mmask8)(U))) + +#define _mm256_mask_permute_ps(W, U, X, C) \ + ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), \ + (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_permute_ps(U, X, C) \ + ((__m256) __builtin_ia32_vpermilps256_mask ((__v8sf)(__m256)(X), (int)(C), \ + (__v8sf)(__m256)_mm256_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm_mask_permute_pd(W, U, X, C) \ + ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), \ + (__v2df)(__m128d)(W), (__mmask8)(U))) + +#define _mm_maskz_permute_pd(U, X, C) \ + ((__m128d) __builtin_ia32_vpermilpd_mask ((__v2df)(__m128d)(X), (int)(C), \ + (__v2df)(__m128d)_mm_setzero_pd (), \ + (__mmask8)(U))) + +#define _mm_mask_permute_ps(W, U, X, C) \ + ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_permute_ps(U, X, C) \ + ((__m128) __builtin_ia32_vpermilps_mask ((__v4sf)(__m128)(X), (int)(C), \ + (__v4sf)(__m128)_mm_setzero_ps (), \ + (__mmask8)(U))) + +#define _mm256_mask_blend_pd(__U, __A, __W) \ + ((__m256d) __builtin_ia32_blendmpd_256_mask ((__v4df) (__A), \ + (__v4df) (__W), \ + (__mmask8) (__U))) + +#define _mm256_mask_blend_ps(__U, __A, __W) \ + ((__m256) __builtin_ia32_blendmps_256_mask ((__v8sf) (__A), \ + (__v8sf) (__W), \ + (__mmask8) (__U))) + +#define _mm256_mask_blend_epi64(__U, __A, __W) \ + ((__m256i) __builtin_ia32_blendmq_256_mask ((__v4di) (__A), \ + (__v4di) (__W), \ + (__mmask8) (__U))) + +#define _mm256_mask_blend_epi32(__U, __A, __W) \ + ((__m256i) __builtin_ia32_blendmd_256_mask ((__v8si) (__A), \ + (__v8si) (__W), \ + (__mmask8) (__U))) + +#define _mm_mask_blend_pd(__U, __A, __W) \ + ((__m128d) __builtin_ia32_blendmpd_128_mask ((__v2df) (__A), \ + (__v2df) (__W), \ + (__mmask8) (__U))) + +#define _mm_mask_blend_ps(__U, __A, __W) \ + ((__m128) __builtin_ia32_blendmps_128_mask ((__v4sf) (__A), \ + (__v4sf) (__W), \ + (__mmask8) (__U))) + +#define _mm_mask_blend_epi64(__U, __A, __W) \ + ((__m128i) __builtin_ia32_blendmq_128_mask ((__v2di) (__A), \ + (__v2di) (__W), \ + (__mmask8) (__U))) + +#define _mm_mask_blend_epi32(__U, __A, __W) \ + ((__m128i) __builtin_ia32_blendmd_128_mask ((__v4si) (__A), \ + (__v4si) (__W), \ + (__mmask8) (__U))) + +#define _mm256_cmp_epu32_mask(X, Y, P) 
\ + ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm256_cmp_epi64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm256_cmp_epi32_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm256_cmp_epu64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm256_cmp_pd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm256_cmp_ps_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm256_mask_cmp_epi64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm256_mask_cmp_epi32_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpd256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm256_mask_cmp_epu64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq256_mask ((__v4di)(__m256i)(X), \ + (__v4di)(__m256i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm256_mask_cmp_epu32_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpd256_mask ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm256_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd256_mask ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm256_mask_cmp_ps_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpps256_mask ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_cmp_epi64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm_cmp_epi32_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm_cmp_epu64_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm_cmp_epu32_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm_cmp_pd_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm_cmp_ps_mask(X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P),\ + (__mmask8)-1)) + +#define _mm_mask_cmp_epi64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpq128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_mask_cmp_epi32_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpd128_mask ((__v4si)(__m128i)(X), \ + (__v4si)(__m128i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_mask_cmp_epu64_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpq128_mask ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_mask_cmp_epu32_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_ucmpd128_mask ((__v4si)(__m128i)(X), \ + 
(__v4si)(__m128i)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_mask_cmp_pd_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmppd128_mask ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P),\ + (__mmask8)(M))) + +#define _mm_mask_cmp_ps_mask(M, X, Y, P) \ + ((__mmask8) __builtin_ia32_cmpps128_mask ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P),\ + (__mmask8)(M))) + +#endif + +#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps ((B), (A)) +#define _mm256_mask_cvt_roundps_ph(A, B, C, D) \ + _mm256_mask_cvtps_ph ((A), (B), (C), (D)) +#define _mm256_maskz_cvt_roundps_ph(A, B, C) \ + _mm256_maskz_cvtps_ph ((A), (B), (C)) +#define _mm_mask_cvt_roundps_ph(A, B, C, D) \ + _mm_mask_cvtps_ph ((A), (B), (C), (D)) +#define _mm_maskz_cvt_roundps_ph(A, B, C) _mm_maskz_cvtps_ph ((A), (B), (C)) + +#ifdef __DISABLE_AVX512VL__ +#undef __DISABLE_AVX512VL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VL__ */ + +#endif /* _AVX512VLINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vnniintrin.h b/include-gcc/avx512vnniintrin.h new file mode 100644 index 0000000..e36e2e5 --- /dev/null +++ b/include-gcc/avx512vnniintrin.h @@ -0,0 +1,144 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
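+/* [Editorial illustration -- not part of the upstream GCC headers.]
+   A minimal sketch of how the masked compare/blend macros at the end of
+   avx512vlintrin.h above compose, assuming a translation unit that
+   includes <immintrin.h> and is built with -mavx512f -mavx512vl
+   (_MM_CMPINT_LT is defined in avx512fintrin.h, not shown in this hunk):
+
+     __m256i clamp_below (__m256i v, __m256i lo)
+     {
+       // bit i of lt is set where lane i of v is (signed) less than lo
+       __mmask8 lt = _mm256_cmp_epi32_mask (v, lo, _MM_CMPINT_LT);
+       // keep v where the bit is clear, take lo where it is set
+       return _mm256_mask_blend_epi32 (lt, v, lo);
+     }
+*/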
+#endif + +#ifndef __AVX512VNNIINTRIN_H_INCLUDED +#define __AVX512VNNIINTRIN_H_INCLUDED + +#if !defined(__AVX512VNNI__) +#pragma GCC push_options +#pragma GCC target("avx512vnni") +#define __DISABLE_AVX512VNNI__ +#endif /* __AVX512VNNI__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpbusd_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpdpbusd_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpbusd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusd_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpbusd_epi32 (__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusd_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpbusds_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpdpbusds_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpbusds_epi32 (__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusds_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpbusds_epi32 (__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpbusds_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwssd_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpdpwssd_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwssd_epi32 (__m512i __A, __mmask16 __B, __m512i __C, __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssd_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwssd_epi32 (__mmask16 __A, __m512i __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssd_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_dpwssds_epi32 (__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vpdpwssds_v16si ((__v16si)__A, (__v16si) __B, + (__v16si) __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_dpwssds_epi32 (__m512i __A, __mmask16 __B, __m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssds_v16si_mask ((__v16si)__A, + (__v16si) __C, (__v16si) __D, (__mmask16)__B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_dpwssds_epi32 (__mmask16 __A, __m512i __B, 
__m512i __C, + __m512i __D) +{ + return (__m512i)__builtin_ia32_vpdpwssds_v16si_maskz ((__v16si)__B, + (__v16si) __C, (__v16si) __D, (__mmask16)__A); +} + +#ifdef __DISABLE_AVX512VNNI__ +#undef __DISABLE_AVX512VNNI__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VNNI__ */ + +#endif /* __AVX512VNNIINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vnnivlintrin.h b/include-gcc/avx512vnnivlintrin.h new file mode 100644 index 0000000..c62a6e8 --- /dev/null +++ b/include-gcc/avx512vnnivlintrin.h @@ -0,0 +1,210 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512VNNIVLINTRIN_H_INCLUDED +#define _AVX512VNNIVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VL__) || !defined(__AVX512VNNI__) +#pragma GCC push_options +#pragma GCC target("avx512vnni,avx512vl") +#define __DISABLE_AVX512VNNIVL__ +#endif /* __AVX512VNNIVL__ */ + +#define _mm256_dpbusd_epi32(A, B, C) \ + ((__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) (A), \ + (__v8si) (B), \ + (__v8si) (C))) + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpbusd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusd_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpbusd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusd_v8si_maskz ((__v8si)__B, + (__v8si) __C, (__v8si) __D, (__mmask8)__A); +} + +#define _mm_dpbusd_epi32(A, B, C) \ + ((__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) (A), \ + (__v4si) (B), \ + (__v4si) (C))) + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpbusd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusd_v4si_mask ((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpbusd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusd_v4si_maskz ((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); +} + +#define _mm256_dpbusds_epi32(A, B, C) \ + ((__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) (A), \ + (__v8si) (B), \ + (__v8si) (C))) + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpbusds_epi32 (__m256i 
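+/* [Editorial illustration -- not part of the upstream GCC headers.]
+   Rough sketch of what the VNNI intrinsics above compute:
+   _mm512_dpbusd_epi32 (acc, a, b) multiplies each unsigned byte of a with
+   the corresponding signed byte of b, sums each group of four adjacent
+   products, and adds that sum to the matching 32-bit lane of acc (the
+   dpbusds variants saturate instead of wrapping).  Assuming <immintrin.h>
+   and -mavx512f -mavx512vnni, a u8 x s8 dot product over 64 bytes:
+
+     int dot64 (const unsigned char *u, const signed char *s)
+     {
+       __m512i a = _mm512_loadu_si512 ((const void *) u);
+       __m512i b = _mm512_loadu_si512 ((const void *) s);
+       __m512i acc = _mm512_dpbusd_epi32 (_mm512_setzero_si512 (), a, b);
+       return _mm512_reduce_add_epi32 (acc);  // horizontal sum of 16 lanes
+     }
+*/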
__A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusds_v8si_mask ((__v8si)__A, + (__v8si) __C, (__v8si) __D, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpbusds_epi32 (__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpbusds_v8si_maskz ((__v8si)__B, + (__v8si) __C, (__v8si) __D, (__mmask8)__A); +} + +#define _mm_dpbusds_epi32(A, B, C) \ + ((__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) (A), \ + (__v4si) (B), \ + (__v4si) (C))) + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpbusds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusds_v4si_mask ((__v4si)__A, + (__v4si) __C, (__v4si) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpbusds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpbusds_v4si_maskz ((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); +} + +#define _mm256_dpwssd_epi32(A, B, C) \ + ((__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) (A), \ + (__v8si) (B), \ + (__v8si) (C))) + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwssd_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssd_v8si_mask ((__v8si)__A, (__v8si) __C, + (__v8si) __D, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwssd_epi32 (__mmask8 __A, __m256i __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssd_v8si_maskz ((__v8si)__B, + (__v8si) __C, (__v8si) __D, (__mmask8)__A); +} + +#define _mm_dpwssd_epi32(A, B, C) \ + ((__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) (A), \ + (__v4si) (B), \ + (__v4si) (C))) + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_dpwssd_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssd_v4si_mask ((__v4si)__A, (__v4si) __C, + (__v4si) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwssd_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssd_v4si_maskz ((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); +} + +#define _mm256_dpwssds_epi32(A, B, C) \ + ((__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si) (A), \ + (__v8si) (B), \ + (__v8si) (C))) + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_dpwssds_epi32 (__m256i __A, __mmask8 __B, __m256i __C, __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssds_v8si_mask ((__v8si)__A, + (__v8si) __C, (__v8si) __D, (__mmask8)__B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_dpwssds_epi32 (__mmask8 __A, __m256i __B, __m256i __C, + __m256i __D) +{ + return (__m256i)__builtin_ia32_vpdpwssds_v8si_maskz ((__v8si)__B, + (__v8si) __C, (__v8si) __D, (__mmask8)__A); +} + +#define _mm_dpwssds_epi32(A, B, C) \ + ((__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) (A), \ + (__v4si) (B), \ + (__v4si) (C))) + +extern __inline __m128i +__attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_mask_dpwssds_epi32 (__m128i __A, __mmask8 __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssds_v4si_mask ((__v4si)__A, + (__v4si) __C, (__v4si) __D, (__mmask8)__B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_dpwssds_epi32 (__mmask8 __A, __m128i __B, __m128i __C, __m128i __D) +{ + return (__m128i)__builtin_ia32_vpdpwssds_v4si_maskz ((__v4si)__B, + (__v4si) __C, (__v4si) __D, (__mmask8)__A); +} +#ifdef __DISABLE_AVX512VNNIVL__ +#undef __DISABLE_AVX512VNNIVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VNNIVL__ */ +#endif /* __DISABLE_AVX512VNNIVL__ */ diff --git a/include-gcc/avx512vp2intersectintrin.h b/include-gcc/avx512vp2intersectintrin.h new file mode 100644 index 0000000..65e2fb1 --- /dev/null +++ b/include-gcc/avx512vp2intersectintrin.h @@ -0,0 +1,58 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512VP2INTERSECTINTRIN_H_INCLUDED +#define _AVX512VP2INTERSECTINTRIN_H_INCLUDED + +#if !defined(__AVX512VP2INTERSECT__) +#pragma GCC push_options +#pragma GCC target("avx512vp2intersect") +#define __DISABLE_AVX512VP2INTERSECT__ +#endif /* __AVX512VP2INTERSECT__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_2intersect_epi32 (__m512i __A, __m512i __B, __mmask16 *__U, + __mmask16 *__M) +{ + __builtin_ia32_2intersectd512 (__U, __M, (__v16si) __A, (__v16si) __B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_2intersect_epi64 (__m512i __A, __m512i __B, __mmask8 *__U, + __mmask8 *__M) +{ + __builtin_ia32_2intersectq512 (__U, __M, (__v8di) __A, (__v8di) __B); +} + +#ifdef __DISABLE_AVX512VP2INTERSECT__ +#undef __DISABLE_AVX512VP2INTERSECT__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VP2INTERSECT__ */ + +#endif /* _AVX512VP2INTERSECTINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vp2intersectvlintrin.h b/include-gcc/avx512vp2intersectvlintrin.h new file mode 100644 index 0000000..ce68aee --- /dev/null +++ b/include-gcc/avx512vp2intersectvlintrin.h @@ -0,0 +1,72 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+ + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED +#define _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VP2INTERSECT__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("avx512vp2intersect,avx512vl") +#define __DISABLE_AVX512VP2INTERSECTVL__ +#endif /* __AVX512VP2INTERSECTVL__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_2intersect_epi32 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M) +{ + __builtin_ia32_2intersectd128 (__U, __M, (__v4si) __A, (__v4si) __B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_2intersect_epi32 (__m256i __A, __m256i __B, __mmask8 *__U, + __mmask8 *__M) +{ + __builtin_ia32_2intersectd256 (__U, __M, (__v8si) __A, (__v8si) __B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_2intersect_epi64 (__m128i __A, __m128i __B, __mmask8 *__U, __mmask8 *__M) +{ + __builtin_ia32_2intersectq128 (__U, __M, (__v2di) __A, (__v2di) __B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_2intersect_epi64 (__m256i __A, __m256i __B, __mmask8 *__U, + __mmask8 *__M) +{ + __builtin_ia32_2intersectq256 (__U, __M, (__v4di) __A, (__v4di) __B); +} + +#ifdef __DISABLE_AVX512VP2INTERSECTVL__ +#undef __DISABLE_AVX512VP2INTERSECTVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VP2INTERSECTVL__ */ + +#endif /* _AVX512VP2INTERSECTVLINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vpopcntdqintrin.h b/include-gcc/avx512vpopcntdqintrin.h new file mode 100644 index 0000000..47897fb --- /dev/null +++ b/include-gcc/avx512vpopcntdqintrin.h @@ -0,0 +1,94 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
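+/* [Editorial illustration -- not part of the upstream GCC headers.]
+   The VP2INTERSECT intrinsics above return no value; they write a pair of
+   masks instead: after _mm512_2intersect_epi32 (a, b, &ka, &kb), bit i of
+   ka is set when lane i of a equals some lane of b, and bit j of kb is set
+   when lane j of b equals some lane of a.  A hedged sketch, assuming
+   <immintrin.h> and -mavx512f -mavx512vp2intersect:
+
+     int count_common (__m512i a, __m512i b)
+     {
+       __mmask16 ka, kb;
+       _mm512_2intersect_epi32 (a, b, &ka, &kb);
+       // lanes of a that also occur somewhere in b
+       return __builtin_popcount ((unsigned) ka);
+     }
+*/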
+#endif + +#ifndef _AVX512VPOPCNTDQINTRIN_H_INCLUDED +#define _AVX512VPOPCNTDQINTRIN_H_INCLUDED + +#ifndef __AVX512VPOPCNTDQ__ +#pragma GCC push_options +#pragma GCC target("avx512vpopcntdq") +#define __DISABLE_AVX512VPOPCNTDQ__ +#endif /* __AVX512VPOPCNTDQ__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_popcnt_epi32 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountd_v16si ((__v16si) __A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_popcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_popcnt_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountd_v16si_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_popcnt_epi64 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountq_v8di ((__v8di) __A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_popcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_popcnt_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcountq_v8di_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +#ifdef __DISABLE_AVX512VPOPCNTDQ__ +#undef __DISABLE_AVX512VPOPCNTDQ__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VPOPCNTDQ__ */ + +#endif /* _AVX512VPOPCNTDQINTRIN_H_INCLUDED */ diff --git a/include-gcc/avx512vpopcntdqvlintrin.h b/include-gcc/avx512vpopcntdqvlintrin.h new file mode 100644 index 0000000..972ab3b --- /dev/null +++ b/include-gcc/avx512vpopcntdqvlintrin.h @@ -0,0 +1,146 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
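+/* [Editorial illustration -- not part of the upstream GCC headers.]
+   _mm512_popcnt_epi32 and _mm512_popcnt_epi64 above replace every lane
+   with its population count; the mask/maskz forms merge into W or zero the
+   inactive lanes as usual.  A small sketch, assuming <immintrin.h> and
+   -mavx512f -mavx512vpopcntdq:
+
+     // total number of set bits in one 64-byte block
+     unsigned popcount512 (const void *p)
+     {
+       __m512i v = _mm512_loadu_si512 (p);
+       __m512i c = _mm512_popcnt_epi64 (v);            // per-lane counts
+       return (unsigned) _mm512_reduce_add_epi64 (c);  // sum of 8 lanes
+     }
+*/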
+#endif + +#ifndef _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED +#define _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED + +#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("avx512vpopcntdq,avx512vl") +#define __DISABLE_AVX512VPOPCNTDQVL__ +#endif /* __AVX512VPOPCNTDQVL__ */ + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountd_v4si ((__v4si) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_popcnt_epi32 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A, + (__v4si) __W, + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_popcnt_epi32 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountd_v4si_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_popcnt_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountd_v8si ((__v8si) __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_popcnt_epi32 (__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A, + (__v8si) __W, + (__mmask16) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_popcnt_epi32 (__mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountd_v8si_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountq_v2di ((__v2di) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_popcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_popcnt_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcountq_v2di_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_popcnt_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountq_v4di ((__v4di) __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_popcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_popcnt_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcountq_v4di_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +#ifdef __DISABLE_AVX512VPOPCNTDQVL__ +#undef __DISABLE_AVX512VPOPCNTDQVL__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX512VPOPCNTDQVL__ */ + +#endif /* _AVX512VPOPCNTDQVLINTRIN_H_INCLUDED */ diff --git 
a/include-gcc/avxifmaintrin.h b/include-gcc/avxifmaintrin.h new file mode 100644 index 0000000..076cc9f --- /dev/null +++ b/include-gcc/avxifmaintrin.h @@ -0,0 +1,78 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVXIFMAINTRIN_H_INCLUDED +#define _AVXIFMAINTRIN_H_INCLUDED + +#ifndef __AVXIFMA__ +#pragma GCC push_options +#pragma GCC target("avxifma") +#define __DISABLE_AVXIFMA__ +#endif /* __AVXIFMA__ */ + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52luq128 ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i) __builtin_ia32_vpmadd52huq128 ((__v2di) __X, + (__v2di) __Y, + (__v2di) __Z); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52luq256 ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z); +} + +extern __inline __m256i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i) __builtin_ia32_vpmadd52huq256 ((__v4di) __X, + (__v4di) __Y, + (__v4di) __Z); +} + +#ifdef __DISABLE_AVXIFMA__ +#undef __DISABLE_AVXIFMA__ +#pragma GCC pop_options +#endif /* __DISABLE_AVXIFMA__ */ + +#endif /* _AVXIFMAINTRIN_H_INCLUDED */ diff --git a/include-gcc/avxintrin.h b/include-gcc/avxintrin.h new file mode 100644 index 0000000..a4166bf --- /dev/null +++ b/include-gcc/avxintrin.h @@ -0,0 +1,1607 @@ +/* Copyright (C) 2008-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 11.0. */ + +#ifndef _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _AVXINTRIN_H_INCLUDED +#define _AVXINTRIN_H_INCLUDED + +#ifndef __AVX__ +#pragma GCC push_options +#pragma GCC target("avx") +#define __DISABLE_AVX__ +#endif /* __AVX__ */ + +/* Internal data types for implementing the intrinsics. */ +typedef double __v4df __attribute__ ((__vector_size__ (32))); +typedef float __v8sf __attribute__ ((__vector_size__ (32))); +typedef long long __v4di __attribute__ ((__vector_size__ (32))); +typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); +typedef int __v8si __attribute__ ((__vector_size__ (32))); +typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); +typedef short __v16hi __attribute__ ((__vector_size__ (32))); +typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); +typedef char __v32qi __attribute__ ((__vector_size__ (32))); +typedef signed char __v32qs __attribute__ ((__vector_size__ (32))); +typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef float __m256 __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef long long __m256i __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), + __may_alias__)); + +/* Unaligned version of the same types. */ +typedef float __m256_u __attribute__ ((__vector_size__ (32), + __may_alias__, + __aligned__ (1))); +typedef long long __m256i_u __attribute__ ((__vector_size__ (32), + __may_alias__, + __aligned__ (1))); +typedef double __m256d_u __attribute__ ((__vector_size__ (32), + __may_alias__, + __aligned__ (1))); + +/* Compare predicates for scalar and packed compare intrinsics. 
*/ + +/* Equal (ordered, non-signaling) */ +#define _CMP_EQ_OQ 0x00 +/* Less-than (ordered, signaling) */ +#define _CMP_LT_OS 0x01 +/* Less-than-or-equal (ordered, signaling) */ +#define _CMP_LE_OS 0x02 +/* Unordered (non-signaling) */ +#define _CMP_UNORD_Q 0x03 +/* Not-equal (unordered, non-signaling) */ +#define _CMP_NEQ_UQ 0x04 +/* Not-less-than (unordered, signaling) */ +#define _CMP_NLT_US 0x05 +/* Not-less-than-or-equal (unordered, signaling) */ +#define _CMP_NLE_US 0x06 +/* Ordered (nonsignaling) */ +#define _CMP_ORD_Q 0x07 +/* Equal (unordered, non-signaling) */ +#define _CMP_EQ_UQ 0x08 +/* Not-greater-than-or-equal (unordered, signaling) */ +#define _CMP_NGE_US 0x09 +/* Not-greater-than (unordered, signaling) */ +#define _CMP_NGT_US 0x0a +/* False (ordered, non-signaling) */ +#define _CMP_FALSE_OQ 0x0b +/* Not-equal (ordered, non-signaling) */ +#define _CMP_NEQ_OQ 0x0c +/* Greater-than-or-equal (ordered, signaling) */ +#define _CMP_GE_OS 0x0d +/* Greater-than (ordered, signaling) */ +#define _CMP_GT_OS 0x0e +/* True (unordered, non-signaling) */ +#define _CMP_TRUE_UQ 0x0f +/* Equal (ordered, signaling) */ +#define _CMP_EQ_OS 0x10 +/* Less-than (ordered, non-signaling) */ +#define _CMP_LT_OQ 0x11 +/* Less-than-or-equal (ordered, non-signaling) */ +#define _CMP_LE_OQ 0x12 +/* Unordered (signaling) */ +#define _CMP_UNORD_S 0x13 +/* Not-equal (unordered, signaling) */ +#define _CMP_NEQ_US 0x14 +/* Not-less-than (unordered, non-signaling) */ +#define _CMP_NLT_UQ 0x15 +/* Not-less-than-or-equal (unordered, non-signaling) */ +#define _CMP_NLE_UQ 0x16 +/* Ordered (signaling) */ +#define _CMP_ORD_S 0x17 +/* Equal (unordered, signaling) */ +#define _CMP_EQ_US 0x18 +/* Not-greater-than-or-equal (unordered, non-signaling) */ +#define _CMP_NGE_UQ 0x19 +/* Not-greater-than (unordered, non-signaling) */ +#define _CMP_NGT_UQ 0x1a +/* False (ordered, signaling) */ +#define _CMP_FALSE_OS 0x1b +/* Not-equal (ordered, signaling) */ +#define _CMP_NEQ_OS 0x1c +/* Greater-than-or-equal (ordered, non-signaling) */ +#define _CMP_GE_OQ 0x1d +/* Greater-than (ordered, non-signaling) */ +#define _CMP_GT_OQ 0x1e +/* True (unordered, signaling) */ +#define _CMP_TRUE_US 0x1f + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A + (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A + (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B); +} + + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_pd (__m256d __A, 
__m256d __B) +{ + return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B); +} + +/* Double/single precision floating point blend instructions - select + data from 2 sources using constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M) +{ + return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, + (__v4df)__Y, + __M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} +#else +#define _mm256_blend_pd(X, Y, M) \ + ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(M))) + +#define _mm256_blend_ps(X, Y, M) \ + ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(M))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M) +{ + return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X, + (__v4df)__Y, + (__v4df)__M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M) +{ + return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8sf)__M); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A / (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A / (__v8sf)__B); +} + +/* Dot product instructions with mask-defined summing and zeroing parts + of result. 
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} +#else +#define _mm256_dp_ps(X, Y, M) \ + ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(M))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A * (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A * (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask) +{ + return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B, + __mask); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask) +{ + return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B, + __mask); +} +#else +#define _mm256_shuffle_pd(A, B, N) \ + ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(N))) + +#define 
_mm256_shuffle_ps(A, B, N) \ + ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(N))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_pd (__m256d __A, __m256d __B) +{ + return (__m256d) ((__v4df)__A - (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_ps (__m256 __A, __m256 __B) +{ + return (__m256) ((__v8sf)__A - (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P) +{ + return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y, + __P); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P) +{ + return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, + __P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P); +} +#else +#define _mm_cmp_pd(X, Y, P) \ + ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P))) + +#define _mm_cmp_ps(X, Y, P) \ + ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P))) + +#define _mm256_cmp_pd(X, Y, P) \ + ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(P))) + +#define _mm256_cmp_ps(X, Y, P) \ + ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(P))) + +#define _mm_cmp_sd(X, Y, P) \ + ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P))) + +#define _mm_cmp_ss(X, Y, P) \ + ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P))) +#endif + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsi256_si32 (__m256i __A) +{ + __v8si __B = (__v8si) __A; + return __B[0]; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_pd (__m128i __A) +{ + return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, 
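+/* [Editorial illustration -- not part of the upstream GCC headers.]
+   The AVX compare intrinsics above take one of the _CMP_* predicates and
+   return an all-ones/all-zeros lane mask in a vector register (not a
+   __mmask); that mask is typically fed to a blendv.  A hedged sketch,
+   assuming <immintrin.h> and -mavx:
+
+     // element-wise: x < y ? a : b, for four doubles
+     __m256d select_lt (__m256d x, __m256d y, __m256d a, __m256d b)
+     {
+       __m256d m = _mm256_cmp_pd (x, y, _CMP_LT_OQ);  // all-ones where x < y
+       return _mm256_blendv_pd (b, a, m);             // a where m is set, else b
+     }
+*/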
__artificial__)) +_mm256_cvtepi32_ps (__m256i __A) +{ + return (__m256)__builtin_ia32_cvtdq2ps256 ((__v8si) __A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_ps (__m256d __A) +{ + return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_pd (__m128 __A) +{ + return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A); +} + +extern __inline double +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtsd_f64 (__m256d __A) +{ + return __A[0]; +} + +extern __inline float +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtss_f32 (__m256 __A) +{ + return __A[0]; +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_pd (__m256d __X, const int __N) +{ + return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_ps (__m256 __X, const int __N) +{ + return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_si256 (__m256i __X, const int __N) +{ + return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi32 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + return _mm_extract_epi32 (__Y, __N % 4); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi16 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + return _mm_extract_epi16 (__Y, __N % 8); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi8 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + return _mm_extract_epi8 (__Y, __N % 16); +} + +#ifdef __x86_64__ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi64 (__m256i __X, const int __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + return _mm_extract_epi64 (__Y, __N % 2); +} +#endif +#else +#define _mm256_extractf128_pd(X, N) \ + ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \ + (int)(N))) + +#define _mm256_extractf128_ps(X, N) \ + ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \ + (int)(N))) + 
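+/* [Editorial illustration -- not part of the upstream GCC headers.]
+   The 256-bit element extractors above first pull out a 128-bit lane with
+   _mm256_extractf128_si256 and then defer to the 128-bit _mm_extract_*
+   intrinsics, which is why the index must be a compile-time constant in
+   both the __OPTIMIZE__ inline form and the macro fallback.  For example,
+   assuming <immintrin.h> and -mavx:
+
+     int third_dword (__m256i v)
+     {
+       return _mm256_extract_epi32 (v, 2);  // element 2 of the low 128-bit half
+     }
+*/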
+#define _mm256_extractf128_si256(X, N) \ + ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \ + (int)(N))) + +#define _mm256_extract_epi32(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ + _mm_extract_epi32 (__Y, (N) % 4); \ + })) + +#define _mm256_extract_epi16(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \ + _mm_extract_epi16 (__Y, (N) % 8); \ + })) + +#define _mm256_extract_epi8(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \ + _mm_extract_epi8 (__Y, (N) % 16); \ + })) + +#ifdef __x86_64__ +#define _mm256_extract_epi64(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ + _mm_extract_epi64 (__Y, (N) % 2); \ + })) +#endif +#endif + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroall (void) +{ + __builtin_ia32_vzeroall (); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroupper (void) +{ + __builtin_ia32_vzeroupper (); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_pd (__m128d __A, __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A, + (__v2di)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_pd (__m256d __A, __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A, + (__v4di)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_ps (__m128 __A, __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A, + (__v4si)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_ps (__m256 __A, __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, + (__v8si)__C); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_pd (__m128d __X, const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_pd (__m256d __X, const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_ps (__m128 __X, const int __C) +{ + return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_ps (__m256 __X, const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C); +} +#else +#define _mm_permute_pd(X, C) \ + ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C))) + +#define _mm256_permute_pd(X, C) \ + ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C))) + +#define _mm_permute_ps(X, C) \ + ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C))) + +#define _mm256_permute_ps(X, C) \ + ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C) +{ + return (__m256d) 
__builtin_ia32_vperm2f128_pd256 ((__v4df)__X, + (__v4df)__Y, + __C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C) +{ + return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X, + (__v8sf)__Y, + __C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C) +{ + return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X, + (__v8si)__Y, + __C); +} +#else +#define _mm256_permute2f128_pd(X, Y, C) \ + ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (int)(C))) + +#define _mm256_permute2f128_ps(X, Y, C) \ + ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (int)(C))) + +#define _mm256_permute2f128_si256(X, Y, C) \ + ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), \ + (int)(C))) +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcast_ss (float const *__X) +{ + return (__m128) __builtin_ia32_vbroadcastss (__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_sd (double const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastsd256 (__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_ss (float const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastss256 (__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_pd (__m128d const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_ps (__m128 const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O) +{ + return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, + (__v2df)__Y, + __O); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O) +{ + return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, + (__v4sf)__Y, + __O); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O) +{ + return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, + (__v4si)__Y, + __O); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi32 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + __Y = _mm_insert_epi32 (__Y, __D, __N % 4); + return _mm256_insertf128_si256 (__X, __Y, __N >> 2); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi16 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + __Y = _mm_insert_epi16 (__Y, __D, __N % 8); + return _mm256_insertf128_si256 (__X, __Y, __N >> 3); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_insert_epi8 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + __Y = _mm_insert_epi8 (__Y, __D, __N % 16); + return _mm256_insertf128_si256 (__X, __Y, __N >> 4); +} + +#ifdef __x86_64__ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi64 (__m256i __X, long long __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + __Y = _mm_insert_epi64 (__Y, __D, __N % 2); + return _mm256_insertf128_si256 (__X, __Y, __N >> 1); +} +#endif +#else +#define _mm256_insertf128_pd(X, Y, O) \ + ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(O))) + +#define _mm256_insertf128_ps(X, Y, O) \ + ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(O))) + +#define _mm256_insertf128_si256(X, Y, O) \ + ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \ + (__v4si)(__m128i)(Y), \ + (int)(O))) + +#define _mm256_insert_epi32(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ + __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \ + })) + +#define _mm256_insert_epi16(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \ + __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \ + })) + +#define _mm256_insert_epi8(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \ + __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \ + })) + +#ifdef __x86_64__ +#define _mm256_insert_epi64(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ + __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \ + })) +#endif +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_pd (double const *__P) +{ + return *(__m256d *)__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_pd (double *__P, __m256d __A) +{ + *(__m256d *)__P = __A; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_ps (float const *__P) +{ + return *(__m256 *)__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_ps (float *__P, __m256 __A) +{ + *(__m256 *)__P = __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_pd (double const *__P) +{ + return *(__m256d_u *)__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_pd (double *__P, __m256d __A) +{ + *(__m256d_u *)__P = __A; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_ps (float const *__P) +{ + return *(__m256_u *)__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_ps (float *__P, __m256 __A) +{ + *(__m256_u *)__P = __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_si256 (__m256i const *__P) +{ + return *__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm256_store_si256 (__m256i *__P, __m256i __A) +{ + *__P = __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_si256 (__m256i_u const *__P) +{ + return *__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_si256 (__m256i_u *__P, __m256i __A) +{ + *__P = __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_pd (double const *__P, __m128i __M) +{ + return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P, + (__v2di)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_pd (double *__P, __m128i __M, __m128d __A) +{ + __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2di)__M, (__v2df)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_pd (double const *__P, __m256i __M) +{ + return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P, + (__v4di)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_pd (double *__P, __m256i __M, __m256d __A) +{ + __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4di)__M, (__v4df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_ps (float const *__P, __m128i __M) +{ + return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P, + (__v4si)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_ps (float *__P, __m128i __M, __m128 __A) +{ + __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4si)__M, (__v4sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_ps (float const *__P, __m256i __M) +{ + return (__m256) __builtin_ia32_maskloadps256 ((const __v8sf *)__P, + (__v8si)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_ps (float *__P, __m256i __M, __m256 __A) +{ + __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8si)__M, (__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movehdup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_moveldup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movedup_pd (__m256d __X) +{ + return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lddqu_si256 (__m256i const *__P) +{ + return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_si256 (__m256i *__A, __m256i __B) +{ + __builtin_ia32_movntdq256 ((__v4di *)__A, (__v4di)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_pd (double *__A, __m256d __B) +{ + __builtin_ia32_movntpd256 (__A, (__v4df)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_stream_ps (float 
*__P, __m256 __A) +{ + __builtin_ia32_movntps256 (__P, (__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_pd (__m256d __V, const int __M) +{ + return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_ps (__m256 __V, const int __M) +{ + return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M); +} +#else +#define _mm256_round_pd(V, M) \ + ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M))) + +#define _mm256_round_ps(V, M) \ + ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M))) +#endif + +#define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL) +#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR) +#define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL) +#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR) + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_ps (__m128 __M, __m128 __V) 
+{ + return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_pd (__m256d __A) +{ + return __builtin_ia32_movmskpd256 ((__v4df)__A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_ps (__m256 __A) +{ + return __builtin_ia32_movmskps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_pd (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m256d __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_ps (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m256 __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_undefined_si256 (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m256i __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_pd (void) +{ + return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_setzero_ps (void) +{ + return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_si256 (void) +{ + return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; +} + +/* Create the vector [A B C D]. */ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_pd (double __A, double __B, double __C, double __D) +{ + return __extension__ (__m256d){ __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H]. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return __extension__ (__m256){ __H, __G, __F, __E, + __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H]. */ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E, + __D, __C, __B, __A }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return __extension__ (__m256i)(__v16hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m256i)(__v32qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31 + }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A }; +} + +/* Create a vector with all elements equal to A. */ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_pd (double __A) +{ + return __extension__ (__m256d){ __A, __A, __A, __A }; +} + +/* Create a vector with all elements equal to A. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_ps (float __A) +{ + return __extension__ (__m256){ __A, __A, __A, __A, + __A, __A, __A, __A }; +} + +/* Create a vector with all elements equal to A. 
*/ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi32 (int __A) +{ + return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A, + __A, __A, __A, __A }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi16 (short __A) +{ + return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi8 (char __A) +{ + return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi64x (long long __A) +{ + return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A }; +} + +/* Create vectors of elements in the reversed order from the + _mm256_set_XXX functions. */ + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_pd (double __A, double __B, double __C, double __D) +{ + return _mm256_set_pd (__D, __C, __B, __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return _mm256_set_epi16 (__q00, __q01, __q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return _mm256_set_epi8 (__q00, __q01, __q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, + __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, + __q28, __q29, __q30, __q31); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return _mm256_set_epi64x (__D, __C, __B, __A); +} + +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. 
*/ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_ps (__m256d __A) +{ + return (__m256) __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_si256 (__m256d __A) +{ + return (__m256i) __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_pd (__m256 __A) +{ + return (__m256d) __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_si256(__m256 __A) +{ + return (__m256i) __A; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_ps (__m256i __A) +{ + return (__m256) __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_pd (__m256i __A) +{ + return (__m256d) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd256_pd128 (__m256d __A) +{ + return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps256_ps128 (__m256 __A) +{ + return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_si128 (__m256i __A) +{ + return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A); +} + +/* When cast is done from a 128 to 256-bit type, the low 128 bits of + the 256-bit result contain source parameter value and the upper 128 + bits of the result are undefined. Those intrinsics shouldn't + generate any extra moves. */ + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd128_pd256 (__m128d __A) +{ + return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps128_ps256 (__m128 __A) +{ + return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi128_si256 (__m128i __A) +{ + return (__m256i) __builtin_ia32_si256_si ((__v4si)__A); +} + +/* Similarly, but with zero extension instead of undefined values. 
*/ + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextpd128_pd256 (__m128d __A) +{ + return _mm256_insertf128_pd (_mm256_setzero_pd (), __A, 0); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextps128_ps256 (__m128 __A) +{ + return _mm256_insertf128_ps (_mm256_setzero_ps (), __A, 0); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zextsi128_si256 (__m128i __A) +{ + return _mm256_insertf128_si256 (_mm256_setzero_si256 (), __A, 0); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_m128 ( __m128 __H, __m128 __L) +{ + return _mm256_insertf128_ps (_mm256_castps128_ps256 (__L), __H, 1); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_m128d (__m128d __H, __m128d __L) +{ + return _mm256_insertf128_pd (_mm256_castpd128_pd256 (__L), __H, 1); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_m128i (__m128i __H, __m128i __L) +{ + return _mm256_insertf128_si256 (_mm256_castsi128_si256 (__L), __H, 1); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_m128 (__m128 __L, __m128 __H) +{ + return _mm256_set_m128 (__H, __L); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_m128d (__m128d __L, __m128d __H) +{ + return _mm256_set_m128d (__H, __L); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_m128i (__m128i __L, __m128i __H) +{ + return _mm256_set_m128i (__H, __L); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu2_m128 (float const *__PH, float const *__PL) +{ + return _mm256_insertf128_ps (_mm256_castps128_ps256 (_mm_loadu_ps (__PL)), + _mm_loadu_ps (__PH), 1); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu2_m128 (float *__PH, float *__PL, __m256 __A) +{ + _mm_storeu_ps (__PL, _mm256_castps256_ps128 (__A)); + _mm_storeu_ps (__PH, _mm256_extractf128_ps (__A, 1)); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu2_m128d (double const *__PH, double const *__PL) +{ + return _mm256_insertf128_pd (_mm256_castpd128_pd256 (_mm_loadu_pd (__PL)), + _mm_loadu_pd (__PH), 1); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu2_m128d (double *__PH, double *__PL, __m256d __A) +{ + _mm_storeu_pd (__PL, _mm256_castpd256_pd128 (__A)); + _mm_storeu_pd (__PH, _mm256_extractf128_pd (__A, 1)); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu2_m128i (__m128i_u const *__PH, __m128i_u const *__PL) +{ + return _mm256_insertf128_si256 (_mm256_castsi128_si256 (_mm_loadu_si128 (__PL)), + _mm_loadu_si128 (__PH), 1); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu2_m128i (__m128i_u *__PH, __m128i_u *__PL, __m256i __A) +{ + _mm_storeu_si128 (__PL, _mm256_castsi256_si128 (__A)); + _mm_storeu_si128 (__PH, _mm256_extractf128_si256 (__A, 1)); +} + +#ifdef __DISABLE_AVX__ +#undef __DISABLE_AVX__ +#pragma GCC pop_options +#endif /* __DISABLE_AVX__ */ + +#endif /* 
_AVXINTRIN_H_INCLUDED */ diff --git a/include-gcc/avxneconvertintrin.h b/include-gcc/avxneconvertintrin.h new file mode 100644 index 0000000..7a90ae1 --- /dev/null +++ b/include-gcc/avxneconvertintrin.h @@ -0,0 +1,140 @@ +/* Copyright (C) 2021-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _AVXNECONVERTINTRIN_H_INCLUDED +#define _AVXNECONVERTINTRIN_H_INCLUDED + +#ifndef __AVXNECONVERT__ +#pragma GCC push_options +#pragma GCC target ("avxneconvert") +#define __DISABLE_AVXNECONVERT__ +#endif /* __AVXNECONVERT__ */ + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bcstnebf16_ps (const void *__P) +{ + return (__m128) __builtin_ia32_vbcstnebf162ps128 ((const __bf16 *) __P); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bcstnebf16_ps (const void *__P) +{ + return (__m256) __builtin_ia32_vbcstnebf162ps256 ((const __bf16 *) __P); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bcstnesh_ps (const void *__P) +{ + return (__m128) __builtin_ia32_vbcstnesh2ps128 ((const _Float16 *) __P); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_bcstnesh_ps (const void *__P) +{ + return (__m256) __builtin_ia32_vbcstnesh2ps256 ((const _Float16 *) __P); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtneebf16_ps (const __m128bh *__A) +{ + return (__m128) __builtin_ia32_vcvtneebf162ps128 ((const __v8bf *) __A); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtneebf16_ps (const __m256bh *__A) +{ + return (__m256) __builtin_ia32_vcvtneebf162ps256 ((const __v16bf *) __A); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtneeph_ps (const __m128h *__A) +{ + return (__m128) __builtin_ia32_vcvtneeph2ps128 ((const __v8hf *) __A); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtneeph_ps (const __m256h *__A) +{ + return (__m256) __builtin_ia32_vcvtneeph2ps256 ((const __v16hf *) __A); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtneobf16_ps (const __m128bh *__A) +{ + return (__m128) __builtin_ia32_vcvtneobf162ps128 ((const __v8bf *) __A); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_cvtneobf16_ps (const __m256bh *__A) +{ + return (__m256) __builtin_ia32_vcvtneobf162ps256 ((const __v16bf *) __A); +} + +extern __inline __m128 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtneoph_ps (const __m128h *__A) +{ + return (__m128) __builtin_ia32_vcvtneoph2ps128 ((const __v8hf *) __A); +} + +extern __inline __m256 +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtneoph_ps (const __m256h *__A) +{ + return (__m256) __builtin_ia32_vcvtneoph2ps256 ((const __v16hf *) __A); +} + +extern __inline __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtneps_avx_pbh (__m128 __A) +{ + return (__m128bh) __builtin_ia32_cvtneps2bf16_v4sf (__A); +} + +extern __inline __m128bh +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtneps_avx_pbh (__m256 __A) +{ + return (__m128bh) __builtin_ia32_cvtneps2bf16_v8sf (__A); +} + +#ifdef __DISABLE_AVXNECONVERT__ +#undef __DISABLE_AVXNECONVERT__ +#pragma GCC pop_options +#endif /* __DISABLE_AVXNECONVERT__ */ + +#endif /* _AVXNECONVERTINTRIN_H_INCLUDED */ diff --git a/include-gcc/avxvnniint8intrin.h b/include-gcc/avxvnniint8intrin.h new file mode 100644 index 0000000..9f8f174 --- /dev/null +++ b/include-gcc/avxvnniint8intrin.h @@ -0,0 +1,138 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
+#endif + +#ifndef _AVXVNNIINT8INTRIN_H_INCLUDED +#define _AVXVNNIINT8INTRIN_H_INCLUDED + +#if !defined(__AVXVNNIINT8__) +#pragma GCC push_options +#pragma GCC target("avxvnniint8") +#define __DISABLE_AVXVNNIINT8__ +#endif /* __AVXVNNIINT8__ */ + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbssd_epi32 (__m128i __W, __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpbssd128 ((__v4si) __W, (__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbssds_epi32 (__m128i __W, __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpbssds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbsud_epi32 (__m128i __W, __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpbsud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbsuds_epi32 (__m128i __W, __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpbsuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbuud_epi32 (__m128i __W, __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpbuud128 ((__v4si) __W, (__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dpbuuds_epi32 (__m128i __W, __m128i __A, __m128i __B) +{ + return (__m128i) + __builtin_ia32_vpdpbuuds128 ((__v4si) __W, (__v4si) __A, (__v4si) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbssd_epi32 (__m256i __W, __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpbssd256 ((__v8si) __W, (__v8si) __A, (__v8si) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbssds_epi32 (__m256i __W, __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpbssds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbsud_epi32 (__m256i __W, __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpbsud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbsuds_epi32 (__m256i __W, __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpbsuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbuud_epi32 (__m256i __W, __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpbuud256 ((__v8si) __W, (__v8si) __A, (__v8si) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dpbuuds_epi32 (__m256i __W, __m256i __A, __m256i __B) +{ + return (__m256i) + __builtin_ia32_vpdpbuuds256 ((__v8si) __W, (__v8si) __A, (__v8si) __B); +} + +#ifdef __DISABLE_AVXVNNIINT8__ +#undef __DISABLE_AVXVNNIINT8__ +#pragma GCC pop_options +#endif /* __DISABLE_AVXVNNIINT8__ */ + +#endif /* __AVXVNNIINT8INTRIN_H_INCLUDED */ diff --git a/include-gcc/avxvnniintrin.h b/include-gcc/avxvnniintrin.h new file mode 100644 
index 0000000..cdea8a9
--- /dev/null
+++ b/include-gcc/avxvnniintrin.h
@@ -0,0 +1,113 @@
+/* Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _IMMINTRIN_H_INCLUDED
+#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef _AVXVNNIINTRIN_H_INCLUDED
+#define _AVXVNNIINTRIN_H_INCLUDED
+
+#if !defined(__AVXVNNI__)
+#pragma GCC push_options
+#pragma GCC target("avxvnni")
+#define __DISABLE_AVXVNNIVL__
+#endif /* __AVXVNNIVL__ */
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbusd_avx_epi32(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpdpbusd_v8si ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbusd_avx_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpbusd_v4si ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpbusds_avx_epi32(__m256i __A, __m256i __B, __m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpdpbusds_v8si ((__v8si) __A,
+                                                  (__v8si) __B,
+                                                  (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpbusds_avx_epi32(__m128i __A,__m128i __B,__m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpbusds_v4si ((__v4si) __A,
+                                                  (__v4si) __B,
+                                                  (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpwssd_avx_epi32(__m256i __A,__m256i __B,__m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpdpwssd_v8si ((__v8si) __A,
+                                                 (__v8si) __B,
+                                                 (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpwssd_avx_epi32(__m128i __A,__m128i __B,__m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpwssd_v4si ((__v4si) __A,
+                                                 (__v4si) __B,
+                                                 (__v4si) __C);
+}
+
+extern __inline __m256i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm256_dpwssds_avx_epi32(__m256i __A,__m256i __B,__m256i __C)
+{
+  return (__m256i) __builtin_ia32_vpdpwssds_v8si ((__v8si) __A,
+                                                  (__v8si) __B,
+                                                  (__v8si) __C);
+}
+
+extern __inline __m128i
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_dpwssds_avx_epi32(__m128i __A,__m128i __B,__m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpdpwssds_v4si ((__v4si) __A,
+                                                  (__v4si) __B,
+                                                  (__v4si) __C);
+}
+
+#ifdef __DISABLE_AVXVNNIVL__
+#undef __DISABLE_AVXVNNIVL__
+#pragma GCC pop_options
+#endif /* __DISABLE_AVXVNNIVL__ */
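/* Illustrative usage sketch only -- not part of the upstream GCC header nor
   of the patch hunks around it.  Assuming a translation unit compiled with
   -mavxvnni that includes <immintrin.h>, the AVX-VNNI intrinsics defined
   above accumulate 8-bit and 16-bit dot products into 32-bit lanes.  The
   helper names below are hypothetical.  */
#include <immintrin.h>

/* Per 32-bit lane: acc += sum of the four products of unsigned bytes of a
   with the corresponding signed bytes of b.  */
static inline __m256i
accumulate_u8s8_dot (__m256i acc, __m256i a, __m256i b)
{
  return _mm256_dpbusd_avx_epi32 (acc, a, b);
}

/* Per 32-bit lane: acc += sum of the two products of adjacent signed 16-bit
   elements of a and b.  */
static inline __m256i
accumulate_s16_dot (__m256i acc, __m256i a, __m256i b)
{
  return _mm256_dpwssd_avx_epi32 (acc, a, b);
}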
+#endif /* _AVXVNNIINTRIN_H_INCLUDED */ diff --git a/include-gcc/bmi2intrin.h b/include-gcc/bmi2intrin.h new file mode 100644 index 0000000..c9915a5 --- /dev/null +++ b/include-gcc/bmi2intrin.h @@ -0,0 +1,109 @@ +/* Copyright (C) 2011-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _BMI2INTRIN_H_INCLUDED +#define _BMI2INTRIN_H_INCLUDED + +#ifndef __BMI2__ +#pragma GCC push_options +#pragma GCC target("bmi2") +#define __DISABLE_BMI2__ +#endif /* __BMI2__ */ + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bzhi_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_bzhi_si (__X, __Y); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pdep_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_pdep_si (__X, __Y); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pext_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_pext_si (__X, __Y); +} + +#ifdef __x86_64__ + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bzhi_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_bzhi_di (__X, __Y); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pdep_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_pdep_di (__X, __Y); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pext_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_pext_di (__X, __Y); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mulx_u64 (unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) +{ + unsigned __int128 __res = (unsigned __int128) __X * __Y; + *__P = (unsigned long long) (__res >> 64); + return (unsigned long long) __res; +} + +#else /* !__x86_64__ */ + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P) +{ + unsigned long long __res = (unsigned long long) __X * __Y; + *__P = (unsigned int) (__res >> 32); + return (unsigned int) __res; +} + +#endif /* !__x86_64__ */ + +#ifdef __DISABLE_BMI2__ +#undef __DISABLE_BMI2__ +#pragma GCC pop_options +#endif /* __DISABLE_BMI2__ */ + +#endif /* 
_BMI2INTRIN_H_INCLUDED */ diff --git a/include-gcc/bmiintrin.h b/include-gcc/bmiintrin.h new file mode 100644 index 0000000..ec8945d --- /dev/null +++ b/include-gcc/bmiintrin.h @@ -0,0 +1,202 @@ +/* Copyright (C) 2010-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _BMIINTRIN_H_INCLUDED +#define _BMIINTRIN_H_INCLUDED + +#ifndef __BMI__ +#pragma GCC push_options +#pragma GCC target("bmi") +#define __DISABLE_BMI__ +#endif /* __BMI__ */ + +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u16 (unsigned short __X) +{ + return __builtin_ia32_tzcnt_u16 (__X); +} + +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tzcnt_u16 (unsigned short __X) +{ + return __builtin_ia32_tzcnt_u16 (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__andn_u32 (unsigned int __X, unsigned int __Y) +{ + return ~__X & __Y; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_andn_u32 (unsigned int __X, unsigned int __Y) +{ + return __andn_u32 (__X, __Y); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextr_u32 (unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_bextr_u32 (__X, __Y); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bextr_u32 (unsigned int __X, unsigned int __Y, unsigned __Z) +{ + return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsi_u32 (unsigned int __X) +{ + return __X & -__X; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsi_u32 (unsigned int __X) +{ + return __blsi_u32 (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsmsk_u32 (unsigned int __X) +{ + return __X ^ (__X - 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsmsk_u32 (unsigned int __X) +{ + return __blsmsk_u32 (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsr_u32 (unsigned int __X) +{ + return __X & (__X - 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsr_u32 (unsigned int __X) +{ + return 
__blsr_u32 (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u32 (unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32 (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tzcnt_u32 (unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32 (__X); +} + + +#ifdef __x86_64__ +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__andn_u64 (unsigned long long __X, unsigned long long __Y) +{ + return ~__X & __Y; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_andn_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __andn_u64 (__X, __Y); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextr_u64 (unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_bextr_u64 (__X, __Y); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_bextr_u64 (unsigned long long __X, unsigned int __Y, unsigned int __Z) +{ + return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsi_u64 (unsigned long long __X) +{ + return __X & -__X; +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsi_u64 (unsigned long long __X) +{ + return __blsi_u64 (__X); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsmsk_u64 (unsigned long long __X) +{ + return __X ^ (__X - 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsmsk_u64 (unsigned long long __X) +{ + return __blsmsk_u64 (__X); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsr_u64 (unsigned long long __X) +{ + return __X & (__X - 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_blsr_u64 (unsigned long long __X) +{ + return __blsr_u64 (__X); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzcnt_u64 (unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64 (__X); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tzcnt_u64 (unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64 (__X); +} + +#endif /* __x86_64__ */ + +#ifdef __DISABLE_BMI__ +#undef __DISABLE_BMI__ +#pragma GCC pop_options +#endif /* __DISABLE_BMI__ */ + +#endif /* _BMIINTRIN_H_INCLUDED */ diff --git a/include-gcc/cetintrin.h b/include-gcc/cetintrin.h new file mode 100644 index 0000000..db21a4c --- /dev/null +++ b/include-gcc/cetintrin.h @@ -0,0 +1,129 @@ +/* Copyright (C) 2015-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _CETINTRIN_H_INCLUDED +#define _CETINTRIN_H_INCLUDED + +#ifndef __SHSTK__ +#pragma GCC push_options +#pragma GCC target ("shstk") +#define __DISABLE_SHSTK__ +#endif /* __SHSTK__ */ + +#ifdef __x86_64__ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_get_ssp (void) +{ + return __builtin_ia32_rdsspq (); +} +#else +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_get_ssp (void) +{ + return __builtin_ia32_rdsspd (); +} +#endif + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_inc_ssp (unsigned int __B) +{ +#ifdef __x86_64__ + __builtin_ia32_incsspq ((unsigned long long) __B); +#else + __builtin_ia32_incsspd (__B); +#endif +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_saveprevssp (void) +{ + __builtin_ia32_saveprevssp (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rstorssp (void *__B) +{ + __builtin_ia32_rstorssp (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrssd (unsigned int __B, void *__C) +{ + __builtin_ia32_wrssd (__B, __C); +} + +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrssq (unsigned long long __B, void *__C) +{ + __builtin_ia32_wrssq (__B, __C); +} +#endif + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrussd (unsigned int __B, void *__C) +{ + __builtin_ia32_wrussd (__B, __C); +} + +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrussq (unsigned long long __B, void *__C) +{ + __builtin_ia32_wrussq (__B, __C); +} +#endif + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_setssbsy (void) +{ + __builtin_ia32_setssbsy (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_clrssbsy (void *__B) +{ + __builtin_ia32_clrssbsy (__B); +} + +#ifdef __DISABLE_SHSTK__ +#undef __DISABLE_SHSTK__ +#pragma GCC pop_options +#endif /* __DISABLE_SHSTK__ */ + +#endif /* _CETINTRIN_H_INCLUDED. */ diff --git a/include-gcc/cldemoteintrin.h b/include-gcc/cldemoteintrin.h new file mode 100644 index 0000000..0641f67 --- /dev/null +++ b/include-gcc/cldemoteintrin.h @@ -0,0 +1,47 @@ +/* Copyright (C) 2018-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _CLDEMOTE_H_INCLUDED +#define _CLDEMOTE_H_INCLUDED + +#ifndef __CLDEMOTE__ +#pragma GCC push_options +#pragma GCC target("cldemote") +#define __DISABLE_CLDEMOTE__ +#endif /* __CLDEMOTE__ */ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cldemote (void *__A) +{ + __builtin_ia32_cldemote (__A); +} +#ifdef __DISABLE_CLDEMOTE__ +#undef __DISABLE_CLDEMOTE__ +#pragma GCC pop_options +#endif /* __DISABLE_CLDEMOTE__ */ + +#endif /* _CLDEMOTE_H_INCLUDED */ diff --git a/include-gcc/clflushoptintrin.h b/include-gcc/clflushoptintrin.h new file mode 100644 index 0000000..8fc45df --- /dev/null +++ b/include-gcc/clflushoptintrin.h @@ -0,0 +1,49 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _CLFLUSHOPTINTRIN_H_INCLUDED +#define _CLFLUSHOPTINTRIN_H_INCLUDED + +#ifndef __CLFLUSHOPT__ +#pragma GCC push_options +#pragma GCC target("clflushopt") +#define __DISABLE_CLFLUSHOPT__ +#endif /* __CLFLUSHOPT__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clflushopt (void *__A) +{ + __builtin_ia32_clflushopt (__A); +} + +#ifdef __DISABLE_CLFLUSHOPT__ +#undef __DISABLE_CLFLUSHOPT__ +#pragma GCC pop_options +#endif /* __DISABLE_CLFLUSHOPT__ */ + +#endif /* _CLFLUSHOPTINTRIN_H_INCLUDED */ diff --git a/include-gcc/clwbintrin.h b/include-gcc/clwbintrin.h new file mode 100644 index 0000000..ef89b03 --- /dev/null +++ b/include-gcc/clwbintrin.h @@ -0,0 +1,49 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _CLWBINTRIN_H_INCLUDED +#define _CLWBINTRIN_H_INCLUDED + +#ifndef __CLWB__ +#pragma GCC push_options +#pragma GCC target("clwb") +#define __DISABLE_CLWB__ +#endif /* __CLWB__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clwb (void *__A) +{ + __builtin_ia32_clwb (__A); +} + +#ifdef __DISABLE_CLWB__ +#undef __DISABLE_CLWB__ +#pragma GCC pop_options +#endif /* __DISABLE_CLWB__ */ + +#endif /* _CLWBINTRIN_H_INCLUDED */ diff --git a/include-gcc/clzerointrin.h b/include-gcc/clzerointrin.h new file mode 100644 index 0000000..552ec5d --- /dev/null +++ b/include-gcc/clzerointrin.h @@ -0,0 +1,44 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _CLZEROINTRIN_H_INCLUDED +#define _CLZEROINTRIN_H_INCLUDED + +#ifndef __CLZERO__ +#pragma GCC push_options +#pragma GCC target("clzero") +#define __DISABLE_CLZERO__ +#endif /* __CLZERO__ */ + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clzero (void * __I) +{ + __builtin_ia32_clzero (__I); +} + +#ifdef __DISABLE_CLZERO__ +#undef __DISABLE_CLZERO__ +#pragma GCC pop_options +#endif /* __DISABLE_CLZERO__ */ + +#endif /* _CLZEROINTRIN_H_INCLUDED */ diff --git a/include-gcc/cmpccxaddintrin.h b/include-gcc/cmpccxaddintrin.h new file mode 100644 index 0000000..c458a9d --- /dev/null +++ b/include-gcc/cmpccxaddintrin.h @@ -0,0 +1,89 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _CMPCCXADDINTRIN_H_INCLUDED +#define _CMPCCXADDINTRIN_H_INCLUDED + +#ifdef __x86_64__ + +#ifndef __CMPCCXADD__ +#pragma GCC push_options +#pragma GCC target("cmpccxadd") +#define __DISABLE_CMPCCXADD__ +#endif /* __CMPCCXADD__ */ + +typedef enum { + _CMPCCX_O, /* Overflow. */ + _CMPCCX_NO, /* No overflow. */ + _CMPCCX_B, /* Below. */ + _CMPCCX_NB, /* Not below. */ + _CMPCCX_Z, /* Zero. */ + _CMPCCX_NZ, /* Not zero. */ + _CMPCCX_BE, /* Below or equal. */ + _CMPCCX_NBE, /* Neither below nor equal. */ + _CMPCCX_S, /* Sign. */ + _CMPCCX_NS, /* No sign. */ + _CMPCCX_P, /* Parity. */ + _CMPCCX_NP, /* No parity. */ + _CMPCCX_L, /* Less. */ + _CMPCCX_NL, /* Not less. */ + _CMPCCX_LE, /* Less or equal. */ + _CMPCCX_NLE, /* Neither less nor equal. */ +} _CMPCCX_ENUM; + +#ifdef __OPTIMIZE__ +extern __inline int +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cmpccxadd_epi32 (int *__A, int __B, int __C, const _CMPCCX_ENUM __D) +{ + return __builtin_ia32_cmpccxadd (__A, __B, __C, __D); +} + +extern __inline long long +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_cmpccxadd_epi64 (long long *__A, long long __B, long long __C, + const _CMPCCX_ENUM __D) +{ + return __builtin_ia32_cmpccxadd64 (__A, __B, __C, __D); +} +#else +#define _cmpccxadd_epi32(A,B,C,D) \ + __builtin_ia32_cmpccxadd ((int *) (A), (int) (B), (int) (C), \ + (_CMPCCX_ENUM) (D)) +#define _cmpccxadd_epi64(A,B,C,D) \ + __builtin_ia32_cmpccxadd64 ((long long *) (A), (long long) (B), \ + (long long) (C), (_CMPCCX_ENUM) (D)) +#endif + +#ifdef __DISABLE_CMPCCXADD__ +#undef __DISABLE_CMPCCXADD__ +#pragma GCC pop_options +#endif /* __DISABLE_CMPCCXADD__ */ + +#endif + +#endif /* _CMPCCXADDINTRIN_H_INCLUDED */ diff --git a/include-gcc/emmintrin.h b/include-gcc/emmintrin.h new file mode 100644 index 0000000..3599be7 --- /dev/null +++ b/include-gcc/emmintrin.h @@ -0,0 +1,1608 @@ +/* Copyright (C) 2003-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. 
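+
+   A minimal usage sketch (illustrative only, not part of the upstream GCC
+   sources): it assumes an SSE2-capable x86 target and that this directory
+   is on the compiler's include path; sum2 is a made-up helper, while the
+   _mm_* calls are the intrinsics defined below.
+
+	#include <emmintrin.h>
+
+	static double
+	sum2 (const double *p)			// p must be 16-byte aligned
+	{
+	  __m128d v = _mm_load_pd (p);		// load {p[0], p[1]}
+	  __m128d r = _mm_add_pd (v, _mm_shuffle_pd (v, v, _MM_SHUFFLE2 (0, 1)));
+	  return _mm_cvtsd_f64 (r);		// p[0] + p[1]
+	}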
*/ + +#ifndef _EMMINTRIN_H_INCLUDED +#define _EMMINTRIN_H_INCLUDED + +/* We need definitions from the SSE header files*/ +#include + +#ifndef __SSE2__ +#pragma GCC push_options +#pragma GCC target("sse2") +#define __DISABLE_SSE2__ +#endif /* __SSE2__ */ + +/* SSE2 */ +typedef double __v2df __attribute__ ((__vector_size__ (16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); +typedef int __v4si __attribute__ ((__vector_size__ (16))); +typedef unsigned int __v4su __attribute__ ((__vector_size__ (16))); +typedef short __v8hi __attribute__ ((__vector_size__ (16))); +typedef unsigned short __v8hu __attribute__ ((__vector_size__ (16))); +typedef char __v16qi __attribute__ ((__vector_size__ (16))); +typedef signed char __v16qs __attribute__ ((__vector_size__ (16))); +typedef unsigned char __v16qu __attribute__ ((__vector_size__ (16))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* Unaligned version of the same types. */ +typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); +typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); + +/* Create a selector for use with the SHUFPD instruction. */ +#define _MM_SHUFFLE2(fp1,fp0) \ + (((fp1) << 1) | (fp0)) + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_sd (double __F) +{ + return __extension__ (__m128d){ __F, 0.0 }; +} + +/* Create a vector with both elements equal to F. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pd (double __F) +{ + return __extension__ (__m128d){ __F, __F }; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd1 (double __F) +{ + return _mm_set1_pd (__F); +} + +/* Create a vector with the lower value X and upper value W. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __X, __W }; +} + +/* Create a vector with the lower value W and upper value X. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pd (double __W, double __X) +{ + return __extension__ (__m128d){ __W, __X }; +} + +/* Create an undefined vector. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_pd (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m128d __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +/* Create a vector of zeros. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_pd (void) +{ + return __extension__ (__m128d){ 0.0, 0.0 }; +} + +/* Sets the low DPFP value of A from the low value of B. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_sd (__m128d __A, __m128d __B) +{ + return __extension__ (__m128d) __builtin_shuffle ((__v2df)__A, (__v2df)__B, (__v2di){2, 1}); +} + +/* Load two DPFP values from P. 
The address must be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd (double const *__P) +{ + return *(__m128d *)__P; +} + +/* Load two DPFP values from P. The address need not be 16-byte aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_pd (double const *__P) +{ + return *(__m128d_u *)__P; +} + +/* Create a vector with all two elements equal to *P. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_pd (double const *__P) +{ + return _mm_set1_pd (*__P); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_sd (double const *__P) +{ + return _mm_set_sd (*__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_pd1 (double const *__P) +{ + return _mm_load1_pd (__P); +} + +/* Load two DPFP values in reverse order. The address must be aligned. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadr_pd (double const *__P) +{ + __m128d __tmp = _mm_load_pd (__P); + return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1)); +} + +/* Store two DPFP values. The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd (double *__P, __m128d __A) +{ + *(__m128d *)__P = __A; +} + +/* Store two DPFP values. The address need not be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_pd (double *__P, __m128d __A) +{ + *(__m128d_u *)__P = __A; +} + +/* Stores the lower DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_sd (double *__P, __m128d __A) +{ + *__P = ((__v2df)__A)[0]; +} + +extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_f64 (__m128d __A) +{ + return ((__v2df)__A)[0]; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pd (double *__P, __m128d __A) +{ + _mm_store_sd (__P, __A); +} + +/* Stores the upper DPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pd (double *__P, __m128d __A) +{ + *__P = ((__v2df)__A)[1]; +} + +/* Store the lower DPFP value across two words. + The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0))); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_pd1 (double *__P, __m128d __A) +{ + _mm_store1_pd (__P, __A); +} + +/* Store two DPFP values in reverse order. The address must be aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_pd (double *__P, __m128d __A) +{ + _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1))); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si32 (__m128i __A) +{ + return __builtin_ia32_vec_ext_v4si ((__v4si)__A, 0); +} + +#ifdef __x86_64__ +/* Intel intrinsic. 
*/ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64 (__m128i __A) +{ + return ((__v2di)__A)[0]; +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi128_si64x (__m128i __A) +{ + return ((__v2di)__A)[0]; +} +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A + (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_addsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A - (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_subsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A * (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_mulsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_pd (__m128d __A, __m128d __B) +{ + return (__m128d) ((__v2df)__A / (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_divsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_pd (__m128d __A) +{ + return (__m128d)__builtin_ia32_sqrtpd ((__v2df)__A); +} + +/* Return pair {sqrt (B[0]), A[1]}. 
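+   For instance (illustrative, not from the upstream sources):
+
+	_mm_sqrt_sd (_mm_set_pd (7.0, 0.0), _mm_set_sd (4.0))	// -> {2.0, 7.0}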
*/ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_sd (__m128d __A, __m128d __B) +{ + __v2df __tmp = __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B); + return (__m128d)__builtin_ia32_sqrtsd ((__v2df)__tmp); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_minsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_maxpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_maxsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_andnpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_orpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_xorpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgtpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpgepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_pd (__m128d __A, __m128d __B) +{ + return 
(__m128d)__builtin_ia32_cmpnlepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngtpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpngepd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpeqsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpltsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmplesd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpltsd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmplesd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpneqsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnltsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpnlesd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpnltsd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df) __A, + (__v2df) + __builtin_ia32_cmpnlesd ((__v2df) __B, + (__v2df) + __A)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpordsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_cmpunord_sd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_cmpunordsd ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdeq ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdlt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdle ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdgt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdge ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_comisdneq ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdeq ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomilt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdlt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdle ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdgt ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdge ((__v2df)__A, (__v2df)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_sd (__m128d __A, __m128d __B) +{ + return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B); +} + +/* Create a vector of Qi, where i is the element number. 
*/ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64x (long long __q1, long long __q0) +{ + return __extension__ (__m128i)(__v2di){ __q0, __q1 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi64 (__m64 __q1, __m64 __q0) +{ + return _mm_set_epi64x ((long long)__q1, (long long)__q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0) +{ + return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4, + short __q3, short __q2, short __q1, short __q0) +{ + return __extension__ (__m128i)(__v8hi){ + __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 }; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m128i)(__v16qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} + +/* Set all of the elements of the vector to A. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64x (long long __A) +{ + return _mm_set_epi64x (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi64 (__m64 __A) +{ + return _mm_set_epi64 (__A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi32 (int __A) +{ + return _mm_set_epi32 (__A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi16 (short __A) +{ + return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_epi8 (char __A) +{ + return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +/* Create a vector of Qi, where i is the element number. + The parameter order is reversed from the _mm_set_epi* functions. 
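+   For instance (an illustrative pairing, not from the upstream sources;
+   both intrinsics are defined in this file):
+
+	__m128i a = _mm_set_epi32 (3, 2, 1, 0);		// element 0 holds 0
+	__m128i b = _mm_setr_epi32 (0, 1, 2, 3);	// same contents as a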
*/ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi64 (__m64 __q0, __m64 __q1) +{ + return _mm_set_epi64 (__q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3) +{ + return _mm_set_epi32 (__q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3, + short __q4, short __q5, short __q6, short __q7) +{ + return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03, + char __q04, char __q05, char __q06, char __q07, + char __q08, char __q09, char __q10, char __q11, + char __q12, char __q13, char __q14, char __q15) +{ + return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08, + __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00); +} + +/* Create a vector with element 0 as *P and the rest zero. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_si128 (__m128i const *__P) +{ + return *__P; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si128 (__m128i_u const *__P) +{ + return *__P; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_epi64 (__m128i_u const *__P) +{ + return _mm_set_epi64 ((__m64)0LL, *(__m64_u *)__P); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si64 (void const *__P) +{ + return _mm_loadl_epi64 ((__m128i_u *)__P); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si32 (void const *__P) +{ + return _mm_set_epi32 (0, 0, 0, (*(__m32_u *)__P)[0]); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_si16 (void const *__P) +{ + return _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, (*(__m16_u *)__P)[0]); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_si128 (__m128i *__P, __m128i __B) +{ + *__P = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si128 (__m128i_u *__P, __m128i __B) +{ + *__P = __B; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_epi64 (__m128i_u *__P, __m128i __B) +{ + *(__m64_u *)__P = (__m64) ((__v2di)__B)[0]; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si64 (void *__P, __m128i __B) +{ + _mm_storel_epi64 ((__m128i_u *)__P, __B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si32 (void *__P, __m128i __B) +{ + *(__m32_u *)__P = (__m32) ((__v4si)__B)[0]; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_si16 (void *__P, __m128i __B) +{ + *(__m16_u *)__P = (__m16) ((__v8hi)__B)[0]; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movepi64_pi64 (__m128i __B) +{ + return (__m64) ((__v2di)__B)[0]; +} + +extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_movpi64_epi64 (__m64 __A) +{ + return _mm_set_epi64 ((__m64)0LL, __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_epi64 (__m128i __A) +{ + return (__m128i)__builtin_ia32_movq128 ((__v2di) __A); +} + +/* Create an undefined vector. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_si128 (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m128i __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +/* Create a vector of zeros. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si128 (void) +{ + return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 }; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_pd (__m128i __A) +{ + return (__m128d)__builtin_ia32_cvtdq2pd ((__v4si) __A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_ps (__m128i __A) +{ + return (__m128)__builtin_ia32_cvtdq2ps ((__v4si) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_epi32 (__m128d __A) +{ + return (__m128i)__builtin_ia32_cvtpd2dq ((__v2df) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvtpd2pi ((__v2df) __A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpd_ps (__m128d __A) +{ + return (__m128)__builtin_ia32_cvtpd2ps ((__v2df) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_epi32 (__m128d __A) +{ + return (__m128i)__builtin_ia32_cvttpd2dq ((__v2df) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttpd_pi32 (__m128d __A) +{ + return (__m64)__builtin_ia32_cvttpd2pi ((__v2df) __A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32_pd (__m64 __A) +{ + return (__m128d)__builtin_ia32_cvtpi2pd ((__v2si) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_epi32 (__m128 __A) +{ + return (__m128i)__builtin_ia32_cvtps2dq ((__v4sf) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_epi32 (__m128 __A) +{ + return (__m128i)__builtin_ia32_cvttps2dq ((__v4sf) __A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pd (__m128 __A) +{ + return (__m128d)__builtin_ia32_cvtps2pd ((__v4sf) __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvtsd2si ((__v2df) __A); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64 (__m128d __A) +{ + return __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} + +/* Microsoft intrinsic. 
*/ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_si64x (__m128d __A) +{ + return __builtin_ia32_cvtsd2si64 ((__v2df) __A); +} +#endif + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si32 (__m128d __A) +{ + return __builtin_ia32_cvttsd2si ((__v2df) __A); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64 (__m128d __A) +{ + return __builtin_ia32_cvttsd2si64 ((__v2df) __A); +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttsd_si64x (__m128d __A) +{ + return __builtin_ia32_cvttsd2si64 ((__v2df) __A); +} +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsd_ss (__m128 __A, __m128d __B) +{ + return (__m128)__builtin_ia32_cvtsd2ss ((__v4sf) __A, (__v2df) __B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_sd (__m128d __A, int __B) +{ + return (__m128d)__builtin_ia32_cvtsi2sd ((__v2df) __A, __B); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_sd (__m128d __A, long long __B) +{ + return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} + +/* Microsoft intrinsic. */ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_sd (__m128d __A, long long __B) +{ + return (__m128d)__builtin_ia32_cvtsi642sd ((__v2df) __A, __B); +} +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_sd (__m128d __A, __m128 __B) +{ + return (__m128d)__builtin_ia32_cvtss2sd ((__v2df) __A, (__v4sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask) +{ + return (__m128d)__builtin_ia32_shufpd ((__v2df)__A, (__v2df)__B, __mask); +} +#else +#define _mm_shuffle_pd(A, B, N) \ + ((__m128d)__builtin_ia32_shufpd ((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(N))) +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpckhpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pd (__m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_unpcklpd ((__v2df)__A, (__v2df)__B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pd (__m128d __A, double const *__B) +{ + return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pd (__m128d __A, double const *__B) +{ + return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pd (__m128d __A) +{ + return __builtin_ia32_movmskpd ((__v2df)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packsswb128 
((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packssdw128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_packuswb128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhbw128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhwd128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhdq128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckhqdq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklbw128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklwd128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpckldq128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A + (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A + (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A + (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A + (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddsw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu8 (__m128i __A, 
__m128i __B) +{ + return (__m128i)__builtin_ia32_paddusb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_paddusw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qu)__A - (__v16qu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A - (__v8hu)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4su)__A - (__v4su)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A - (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubsw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psubusw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaddwd128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hu)__A * (__v8hu)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_su32 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pmuludq ((__v2si)__A, (__v2si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmuludq128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllwi128 ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_pslldi128 ((__v4si)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psllqi128 
((__v2di)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrawi128 ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psradi128 ((__v4si)__A, __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bsrli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_bslli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_psrldqi128 (__A, __N * 8); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si128 (__m128i __A, const int __N) +{ + return (__m128i)__builtin_ia32_pslldqi128 (__A, __N * 8); +} +#else +#define _mm_bsrli_si128(A, N) \ + ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8)) +#define _mm_bslli_si128(A, N) \ + ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8)) +#define _mm_srli_si128(A, N) \ + ((__m128i)__builtin_ia32_psrldqi128 ((__m128i)(A), (int)(N) * 8)) +#define _mm_slli_si128(A, N) \ + ((__m128i)__builtin_ia32_pslldqi128 ((__m128i)(A), (int)(N) * 8)) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi16 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlwi128 ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi32 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrldi128 ((__v4si)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_epi64 (__m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_psrlqi128 ((__v2di)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psllw128((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pslld128((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psllq128((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psraw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrad128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlw128 ((__v8hi)__A, 
(__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrld128 ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlq128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si128 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A & (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si128 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pandn128 ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si128 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A | (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si128 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v2du)__A ^ (__v2du)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qi)__A == (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A == (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4si)__A == (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qs)__A < (__v16qs)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A < (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4si)__A < (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v16qs)__A > (__v16qs)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v8hi)__A > (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i) ((__v4si)__A > (__v4si)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi16 (__m128i const __A, int const __N) +{ + return (unsigned short) __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi16 (__m128i const __A, int const __D, int const __N) +{ + return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N); +} +#else +#define _mm_extract_epi16(A, N) \ + ((int) (unsigned short) __builtin_ia32_vec_ext_v8hi 
((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_insert_epi16(A, D, N) \ + ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(__m128i)(A), \ + (int)(D), (int)(N))) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxsw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmaxub128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminsw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pminub128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_epi8 (__m128i __A) +{ + return __builtin_ia32_pmovmskb128 ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pmulhuw128 ((__v8hi)__A, (__v8hi)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflehi_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufhw ((__v8hi)__A, __mask); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shufflelo_epi16 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshuflw ((__v8hi)__A, __mask); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi32 (__m128i __A, const int __mask) +{ + return (__m128i)__builtin_ia32_pshufd ((__v4si)__A, __mask); +} +#else +#define _mm_shufflehi_epi16(A, N) \ + ((__m128i)__builtin_ia32_pshufhw ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shufflelo_epi16(A, N) \ + ((__m128i)__builtin_ia32_pshuflw ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_shuffle_epi32(A, N) \ + ((__m128i)__builtin_ia32_pshufd ((__v4si)(__m128i)(A), (int)(N))) +#endif + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C) +{ + __builtin_ia32_maskmovdqu ((__v16qi)__A, (__v16qi)__B, __C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgb128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_avg_epu16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_pavgw128 ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sad_epu8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psadbw128 ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si32 (int *__A, int __B) +{ + __builtin_ia32_movnti (__A, __B); +} + +#ifdef __x86_64__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_stream_si64 (long long int *__A, long long int __B) +{ + __builtin_ia32_movnti64 (__A, __B); +} +#endif + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_si128 (__m128i *__A, __m128i __B) +{ + __builtin_ia32_movntdq ((__v2di *)__A, (__v2di)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_pd (double *__A, __m128d __B) +{ + __builtin_ia32_movntpd (__A, (__v2df)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clflush (void const *__A) +{ + __builtin_ia32_clflush (__A); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lfence (void) +{ + __builtin_ia32_lfence (); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mfence (void) +{ + __builtin_ia32_mfence (); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si128 (int __A) +{ + return _mm_set_epi32 (0, 0, 0, __A); +} + +#ifdef __x86_64__ +/* Intel intrinsic. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si128 (long long __A) +{ + return _mm_set_epi64x (0, __A); +} + +/* Microsoft intrinsic. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si128 (long long __A) +{ + return _mm_set_epi64x (0, __A); +} +#endif + +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_ps(__m128d __A) +{ + return (__m128) __A; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castpd_si128(__m128d __A) +{ + return (__m128i) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_pd(__m128 __A) +{ + return (__m128d) __A; +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castps_si128(__m128 __A) +{ + return (__m128i) __A; +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_ps(__m128i __A) +{ + return (__m128) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_castsi128_pd(__m128i __A) +{ + return (__m128d) __A; +} + +#ifdef __DISABLE_SSE2__ +#undef __DISABLE_SSE2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE2__ */ + +#endif /* _EMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/enqcmdintrin.h b/include-gcc/enqcmdintrin.h new file mode 100644 index 0000000..59682e2 --- /dev/null +++ b/include-gcc/enqcmdintrin.h @@ -0,0 +1,55 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _ENQCMDINTRIN_H_INCLUDED +#define _ENQCMDINTRIN_H_INCLUDED + +#ifndef __ENQCMD__ +#pragma GCC push_options +#pragma GCC target ("enqcmd") +#define __DISABLE_ENQCMD__ +#endif /* __ENQCMD__ */ + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enqcmd (void * __P, const void * __Q) +{ + return __builtin_ia32_enqcmd (__P, __Q); +} + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enqcmds (void * __P, const void * __Q) +{ + return __builtin_ia32_enqcmds (__P, __Q); +} + +#ifdef __DISABLE_ENQCMD__ +#undef __DISABLE_ENQCMD__ +#pragma GCC pop_options +#endif /* __DISABLE_ENQCMD__ */ +#endif /* _ENQCMDINTRIN_H_INCLUDED. */ diff --git a/include-gcc/f16cintrin.h b/include-gcc/f16cintrin.h new file mode 100644 index 0000000..72c7c23 --- /dev/null +++ b/include-gcc/f16cintrin.h @@ -0,0 +1,98 @@ +/* Copyright (C) 2011-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _X86INTRIN_H_INCLUDED && !defined _IMMINTRIN_H_INCLUDED +# error "Never use directly; include or instead." 
+#endif + +#ifndef _F16CINTRIN_H_INCLUDED +#define _F16CINTRIN_H_INCLUDED + +#ifndef __F16C__ +#pragma GCC push_options +#pragma GCC target("f16c") +#define __DISABLE_F16C__ +#endif /* __F16C__ */ + +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cvtsh_ss (unsigned short __S) +{ + __v8hi __H = __extension__ (__v8hi){ (short) __S, 0, 0, 0, 0, 0, 0, 0 }; + __v4sf __A = __builtin_ia32_vcvtph2ps (__H); + return __builtin_ia32_vec_ext_v4sf (__A, 0); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtph_ps (__m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps ((__v8hi) __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtph_ps (__m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256 ((__v8hi) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_cvtss_sh (float __F, const int __I) +{ + __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; + __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); + return (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_ph (__m128 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph ((__v4sf) __A, __I); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_ph (__m256 __A, const int __I) +{ + return (__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf) __A, __I); +} +#else +#define _cvtss_sh(__F, __I) \ + (__extension__ \ + ({ \ + __v4sf __A = __extension__ (__v4sf){ __F, 0, 0, 0 }; \ + __v8hi __H = __builtin_ia32_vcvtps2ph (__A, __I); \ + (unsigned short) __builtin_ia32_vec_ext_v8hi (__H, 0); \ + })) + +#define _mm_cvtps_ph(A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph ((__v4sf)(__m128) (A), (int) (I))) + +#define _mm256_cvtps_ph(A, I) \ + ((__m128i) __builtin_ia32_vcvtps2ph256 ((__v8sf)(__m256) (A), (int) (I))) +#endif /* __OPTIMIZE */ + +#ifdef __DISABLE_F16C__ +#undef __DISABLE_F16C__ +#pragma GCC pop_options +#endif /* __DISABLE_F16C__ */ + +#endif /* _F16CINTRIN_H_INCLUDED */ diff --git a/include-gcc/fma4intrin.h b/include-gcc/fma4intrin.h new file mode 100644 index 0000000..e43a91f --- /dev/null +++ b/include-gcc/fma4intrin.h @@ -0,0 +1,241 @@ +/* Copyright (C) 2007-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +#ifndef _FMA4INTRIN_H_INCLUDED +#define _FMA4INTRIN_H_INCLUDED + +/* We need definitions from the SSE4A, SSE3, SSE2 and SSE header files. */ +#include + +#ifndef __FMA4__ +#pragma GCC push_options +#pragma GCC target("fma4") +#define __DISABLE_FMA4__ +#endif /* __FMA4__ */ + +/* 128b Floating point multiply/add type instructions. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ps (__m128 __A, __m128 __B, __m128 __C) + +{ + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd ((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmacc_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddps (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_pd (__m128d __A, __m128d __B, __m128d __C) 
+{ + return (__m128d) __builtin_ia32_vfmaddpd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss (-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_nmsub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd (-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_msubadd_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +/* 256b Floating point multiply/add type instructions. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_macc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_ps (__m256 __A, __m256 __B, __m256 __C) + +{ + return (__m256) __builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmacc_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddps256 (-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_nmsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddpd256 (-(__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +extern __inline __m256 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maddsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_vfmaddsubps256 ((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_msubadd_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_vfmaddsubpd256 ((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +#ifdef __DISABLE_FMA4__ +#undef __DISABLE_FMA4__ +#pragma GCC pop_options +#endif /* __DISABLE_FMA4__ */ + +#endif diff --git a/include-gcc/fmaintrin.h b/include-gcc/fmaintrin.h new file mode 100644 index 0000000..f5d643e --- /dev/null +++ b/include-gcc/fmaintrin.h @@ -0,0 +1,302 @@ +/* Copyright (C) 2011-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +#ifndef _FMAINTRIN_H_INCLUDED +#define _FMAINTRIN_H_INCLUDED + +#ifndef __FMA__ +#pragma GCC push_options +#pragma GCC target("fma") +#define __DISABLE_FMA__ +#endif /* __FMA__ */ + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmadd_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256 ((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmadd_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256 ((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_vfmaddsd3 ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmadd_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_vfmaddss3 ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmsubpd ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmsubpd256 ((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmsubps ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmsubps256 ((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmsubsd3 ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmsubss3 ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfnmaddpd ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fnmadd_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfnmaddpd256 ((__v4df)__A, (__v4df)__B, + 
(__v4df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfnmaddps ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fnmadd_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfnmaddps256 ((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfnmaddsd3 ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmadd_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfnmaddss3 ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfnmsubpd ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fnmsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfnmsubpd256 ((__v4df)__A, (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfnmsubps ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fnmsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfnmsubps256 ((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_sd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfnmsubsd3 ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fnmsub_ss (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfnmsubss3 ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmaddsub_pd (__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, + (__v2df)__C); +} + +extern __inline __m256d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmaddsub_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A, + (__v4df)__B, + (__v4df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmaddsub_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmaddsub_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A, + (__v8sf)__B, + (__v8sf)__C); +} + +extern __inline __m128d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsubadd_pd (__m128d __A, __m128d 
__B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsubpd ((__v2df)__A, (__v2df)__B, + -(__v2df)__C); +} + +extern __inline __m256d +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmsubadd_pd (__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddsubpd256 ((__v4df)__A, + (__v4df)__B, + -(__v4df)__C); +} + +extern __inline __m128 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_fmsubadd_ps (__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddsubps ((__v4sf)__A, (__v4sf)__B, + -(__v4sf)__C); +} + +extern __inline __m256 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_fmsubadd_ps (__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddsubps256 ((__v8sf)__A, + (__v8sf)__B, + -(__v8sf)__C); +} + +#ifdef __DISABLE_FMA__ +#undef __DISABLE_FMA__ +#pragma GCC pop_options +#endif /* __DISABLE_FMA__ */ + +#endif diff --git a/include-gcc/fxsrintrin.h b/include-gcc/fxsrintrin.h new file mode 100644 index 0000000..26506a6 --- /dev/null +++ b/include-gcc/fxsrintrin.h @@ -0,0 +1,73 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _FXSRINTRIN_H_INCLUDED +#define _FXSRINTRIN_H_INCLUDED + +#ifndef __FXSR__ +#pragma GCC push_options +#pragma GCC target("fxsr") +#define __DISABLE_FXSR__ +#endif /* __FXSR__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_fxsave (void *__P) +{ + __builtin_ia32_fxsave (__P); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_fxrstor (void *__P) +{ + __builtin_ia32_fxrstor (__P); +} + +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_fxsave64 (void *__P) +{ + __builtin_ia32_fxsave64 (__P); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_fxrstor64 (void *__P) +{ + __builtin_ia32_fxrstor64 (__P); +} +#endif + +#ifdef __DISABLE_FXSR__ +#undef __DISABLE_FXSR__ +#pragma GCC pop_options +#endif /* __DISABLE_FXSR__ */ + + +#endif /* _FXSRINTRIN_H_INCLUDED */ diff --git a/include-gcc/gfniintrin.h b/include-gcc/gfniintrin.h new file mode 100644 index 0000000..ef3dc22 --- /dev/null +++ b/include-gcc/gfniintrin.h @@ -0,0 +1,414 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _GFNIINTRIN_H_INCLUDED +#define _GFNIINTRIN_H_INCLUDED + +#if !defined(__GFNI__) || !defined(__SSE2__) +#pragma GCC push_options +#pragma GCC target("gfni,sse2") +#define __DISABLE_GFNI__ +#endif /* __GFNI__ */ + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_gf2p8mul_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, + (__v16qi) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_gf2p8affineinv_epi64_epi8 (__m128i __A, __m128i __B, const int __C) +{ + return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi ((__v16qi) __A, + (__v16qi) __B, + __C); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_gf2p8affine_epi64_epi8 (__m128i __A, __m128i __B, const int __C) +{ + return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi) __A, + (__v16qi) __B, __C); +} +#else +#define _mm_gf2p8affineinv_epi64_epi8(A, B, C) \ + ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(C))) +#define _mm_gf2p8affine_epi64_epi8(A, B, C) \ + ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi ((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_GFNI__ +#undef __DISABLE_GFNI__ +#pragma GCC pop_options +#endif /* __DISABLE_GFNI__ */ + +#if !defined(__GFNI__) || !defined(__AVX__) +#pragma GCC push_options +#pragma GCC target("gfni,avx") +#define __DISABLE_GFNIAVX__ +#endif /* __GFNIAVX__ */ + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_gf2p8mul_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi ((__v32qi) __A, + (__v32qi) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_gf2p8affineinv_epi64_epi8 (__m256i __A, __m256i __B, const int __C) +{ + return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi ((__v32qi) __A, + (__v32qi) __B, + __C); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_gf2p8affine_epi64_epi8 (__m256i __A, __m256i __B, const int __C) +{ + return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi) __A, + (__v32qi) __B, __C); +} +#else +#define _mm256_gf2p8affineinv_epi64_epi8(A, B, C) \ + ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (int)(C))) 
+#define _mm256_gf2p8affine_epi64_epi8(A, B, C) \ + ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi ((__v32qi)(__m256i)(A), \ + ( __v32qi)(__m256i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_GFNIAVX__ +#undef __DISABLE_GFNIAVX__ +#pragma GCC pop_options +#endif /* __GFNIAVX__ */ + +#if !defined(__GFNI__) || !defined(__AVX512VL__) +#pragma GCC push_options +#pragma GCC target("gfni,avx512vl") +#define __DISABLE_GFNIAVX512VL__ +#endif /* __GFNIAVX512VL__ */ + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_gf2p8mul_epi8 (__m128i __A, __mmask16 __B, __m128i __C, __m128i __D) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __C, + (__v16qi) __D, + (__v16qi)__A, __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_gf2p8mul_epi8 (__mmask16 __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi_mask ((__v16qi) __B, + (__v16qi) __C, (__v16qi) _mm_setzero_si128 (), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_gf2p8affineinv_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C, + __m128i __D, const int __E) +{ + return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __C, + (__v16qi) __D, + __E, + (__v16qi)__A, + __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_gf2p8affineinv_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C, + const int __D) +{ + return (__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask ((__v16qi) __B, + (__v16qi) __C, __D, + (__v16qi) _mm_setzero_si128 (), + __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mask_gf2p8affine_epi64_epi8 (__m128i __A, __mmask16 __B, __m128i __C, + __m128i __D, const int __E) +{ + return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __C, + (__v16qi) __D, __E, (__v16qi)__A, __B); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskz_gf2p8affine_epi64_epi8 (__mmask16 __A, __m128i __B, __m128i __C, + const int __D) +{ + return (__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask ((__v16qi) __B, + (__v16qi) __C, __D, (__v16qi) _mm_setzero_si128 (), __A); +} +#else +#define _mm_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \ + (__v16qi)(__m128i)(C), (__v16qi)(__m128i)(D), \ + (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B))) +#define _mm_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \ + ((__m128i) __builtin_ia32_vgf2p8affineinvqb_v16qi_mask( \ + (__v16qi)(__m128i)(B), (__v16qi)(__m128i)(C), \ + (int)(D), (__v16qi)(__m128i) _mm_setzero_si128 (), \ + (__mmask16)(A))) +#define _mm_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \ + ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(C),\ + (__v16qi)(__m128i)(D), (int)(E), (__v16qi)(__m128i)(A), (__mmask16)(B))) +#define _mm_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \ + ((__m128i) __builtin_ia32_vgf2p8affineqb_v16qi_mask((__v16qi)(__m128i)(B),\ + (__v16qi)(__m128i)(C), (int)(D), \ + (__v16qi)(__m128i) _mm_setzero_si128 (), (__mmask16)(A))) +#endif + +#ifdef __DISABLE_GFNIAVX512VL__ +#undef __DISABLE_GFNIAVX512VL__ +#pragma GCC pop_options +#endif /* __GFNIAVX512VL__ */ + +#if !defined(__GFNI__) || !defined(__AVX512VL__) || !defined(__AVX512BW__) +#pragma GCC push_options 
+#pragma GCC target("gfni,avx512vl,avx512bw") +#define __DISABLE_GFNIAVX512VLBW__ +#endif /* __GFNIAVX512VLBW__ */ + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_gf2p8mul_epi8 (__m256i __A, __mmask32 __B, __m256i __C, + __m256i __D) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __C, + (__v32qi) __D, + (__v32qi)__A, __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_gf2p8mul_epi8 (__mmask32 __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi_mask ((__v32qi) __B, + (__v32qi) __C, (__v32qi) _mm256_setzero_si256 (), __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_gf2p8affineinv_epi64_epi8 (__m256i __A, __mmask32 __B, + __m256i __C, __m256i __D, const int __E) +{ + return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __C, + (__v32qi) __D, + __E, + (__v32qi)__A, + __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_gf2p8affineinv_epi64_epi8 (__mmask32 __A, __m256i __B, + __m256i __C, const int __D) +{ + return (__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask ((__v32qi) __B, + (__v32qi) __C, __D, + (__v32qi) _mm256_setzero_si256 (), __A); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mask_gf2p8affine_epi64_epi8 (__m256i __A, __mmask32 __B, __m256i __C, + __m256i __D, const int __E) +{ + return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __C, + (__v32qi) __D, + __E, + (__v32qi)__A, + __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskz_gf2p8affine_epi64_epi8 (__mmask32 __A, __m256i __B, + __m256i __C, const int __D) +{ + return (__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask ((__v32qi) __B, + (__v32qi) __C, __D, (__v32qi)_mm256_setzero_si256 (), __A); +} +#else +#define _mm256_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \ + (__v32qi)(__m256i)(C), (__v32qi)(__m256i)(D), (int)(E), \ + (__v32qi)(__m256i)(A), (__mmask32)(B))) +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \ + ((__m256i) __builtin_ia32_vgf2p8affineinvqb_v32qi_mask( \ + (__v32qi)(__m256i)(B), (__v32qi)(__m256i)(C), (int)(D), \ + (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A))) +#define _mm256_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \ + ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(C),\ + (__v32qi)(__m256i)(D), (int)(E), (__v32qi)(__m256i)(A), (__mmask32)(B))) +#define _mm256_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \ + ((__m256i) __builtin_ia32_vgf2p8affineqb_v32qi_mask((__v32qi)(__m256i)(B),\ + (__v32qi)(__m256i)(C), (int)(D), \ + (__v32qi)(__m256i) _mm256_setzero_si256 (), (__mmask32)(A))) +#endif + +#ifdef __DISABLE_GFNIAVX512VLBW__ +#undef __DISABLE_GFNIAVX512VLBW__ +#pragma GCC pop_options +#endif /* __GFNIAVX512VLBW__ */ + +#if !defined(__GFNI__) || !defined(__AVX512F__) || !defined(__AVX512BW__) +#pragma GCC push_options +#pragma GCC target("gfni,avx512f,avx512bw") +#define __DISABLE_GFNIAVX512FBW__ +#endif /* __GFNIAVX512FBW__ */ + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_gf2p8mul_epi8 (__m512i __A, __mmask64 __B, __m512i __C, + __m512i __D) +{ + 
return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __C, + (__v64qi) __D, (__v64qi)__A, __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_gf2p8mul_epi8 (__mmask64 __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi_mask ((__v64qi) __B, + (__v64qi) __C, (__v64qi) _mm512_setzero_si512 (), __A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_gf2p8mul_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi ((__v64qi) __A, + (__v64qi) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_gf2p8affineinv_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C, + __m512i __D, const int __E) +{ + return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __C, + (__v64qi) __D, + __E, + (__v64qi)__A, + __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_gf2p8affineinv_epi64_epi8 (__mmask64 __A, __m512i __B, + __m512i __C, const int __D) +{ + return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask ((__v64qi) __B, + (__v64qi) __C, __D, + (__v64qi) _mm512_setzero_si512 (), __A); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_gf2p8affineinv_epi64_epi8 (__m512i __A, __m512i __B, const int __C) +{ + return (__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ((__v64qi) __A, + (__v64qi) __B, __C); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_mask_gf2p8affine_epi64_epi8 (__m512i __A, __mmask64 __B, __m512i __C, + __m512i __D, const int __E) +{ + return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __C, + (__v64qi) __D, __E, (__v64qi)__A, __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_maskz_gf2p8affine_epi64_epi8 (__mmask64 __A, __m512i __B, __m512i __C, + const int __D) +{ + return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask ((__v64qi) __B, + (__v64qi) __C, __D, (__v64qi) _mm512_setzero_si512 (), __A); +} +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_gf2p8affine_epi64_epi8 (__m512i __A, __m512i __B, const int __C) +{ + return (__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi) __A, + (__v64qi) __B, __C); +} +#else +#define _mm512_mask_gf2p8affineinv_epi64_epi8(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \ + (__v64qi)(__m512i)(C), (__v64qi)(__m512i)(D), (int)(E), \ + (__v64qi)(__m512i)(A), (__mmask64)(B))) +#define _mm512_maskz_gf2p8affineinv_epi64_epi8(A, B, C, D) \ + ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi_mask( \ + (__v64qi)(__m512i)(B), (__v64qi)(__m512i)(C), (int)(D), \ + (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A))) +#define _mm512_gf2p8affineinv_epi64_epi8(A, B, C) \ + ((__m512i) __builtin_ia32_vgf2p8affineinvqb_v64qi ( \ + (__v64qi)(__m512i)(A), (__v64qi)(__m512i)(B), (int)(C))) +#define _mm512_mask_gf2p8affine_epi64_epi8(A, B, C, D, E) \ + ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(C),\ + (__v64qi)(__m512i)(D), (int)(E), (__v64qi)(__m512i)(A), (__mmask64)(B))) +#define _mm512_maskz_gf2p8affine_epi64_epi8(A, B, C, D) \ + ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi_mask((__v64qi)(__m512i)(B),\ + 
(__v64qi)(__m512i)(C), (int)(D), \ + (__v64qi)(__m512i) _mm512_setzero_si512 (), (__mmask64)(A))) +#define _mm512_gf2p8affine_epi64_epi8(A, B, C) \ + ((__m512i) __builtin_ia32_vgf2p8affineqb_v64qi ((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_GFNIAVX512FBW__ +#undef __DISABLE_GFNIAVX512FBW__ +#pragma GCC pop_options +#endif /* __GFNIAVX512FBW__ */ + +#endif /* _GFNIINTRIN_H_INCLUDED */ diff --git a/include-gcc/hresetintrin.h b/include-gcc/hresetintrin.h new file mode 100644 index 0000000..7a29665 --- /dev/null +++ b/include-gcc/hresetintrin.h @@ -0,0 +1,48 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _HRESETINTRIN_H_INCLUDED +#define _HRESETINTRIN_H_INCLUDED + +#ifndef __HRESET__ +#pragma GCC push_options +#pragma GCC target ("hreset") +#define __DISABLE_HRESET__ +#endif /* __HRESET__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_hreset (unsigned int __EAX) +{ + __builtin_ia32_hreset (__EAX); +} + +#ifdef __DISABLE_HRESET__ +#undef __DISABLE_HRESET__ +#pragma GCC pop_options +#endif /* __DISABLE_HRESET__ */ +#endif /* _HRESETINTRIN_H_INCLUDED. */ diff --git a/include-gcc/ia32intrin.h b/include-gcc/ia32intrin.h new file mode 100644 index 0000000..25b19bd --- /dev/null +++ b/include-gcc/ia32intrin.h @@ -0,0 +1,317 @@ +/* Copyright (C) 2009-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +/* 32bit bsf */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsfd (int __X) +{ + return __builtin_ctz (__X); +} + +/* 32bit bsr */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsrd (int __X) +{ + return __builtin_ia32_bsrsi (__X); +} + +/* 32bit bswap */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bswapd (int __X) +{ + return __builtin_bswap32 (__X); +} + +#ifndef __iamcu__ + +#ifndef __CRC32__ +#pragma GCC push_options +#pragma GCC target("crc32") +#define __DISABLE_CRC32__ +#endif /* __CRC32__ */ + +/* 32bit accumulate CRC32 (polynomial 0x11EDC6F41) value. */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32b (unsigned int __C, unsigned char __V) +{ + return __builtin_ia32_crc32qi (__C, __V); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32w (unsigned int __C, unsigned short __V) +{ + return __builtin_ia32_crc32hi (__C, __V); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32d (unsigned int __C, unsigned int __V) +{ + return __builtin_ia32_crc32si (__C, __V); +} + +#ifdef __DISABLE_CRC32__ +#undef __DISABLE_CRC32__ +#pragma GCC pop_options +#endif /* __DISABLE_CRC32__ */ + +#endif /* __iamcu__ */ + +/* 32bit popcnt */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__popcntd (unsigned int __X) +{ + return __builtin_popcount (__X); +} + +#ifndef __iamcu__ + +/* rdpmc */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdpmc (int __S) +{ + return __builtin_ia32_rdpmc (__S); +} + +#endif /* __iamcu__ */ + +/* rdtsc */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtsc (void) +{ + return __builtin_ia32_rdtsc (); +} + +#ifndef __iamcu__ + +/* rdtscp */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rdtscp (unsigned int *__A) +{ + return __builtin_ia32_rdtscp (__A); +} + +#endif /* __iamcu__ */ + +/* 8bit rol */ +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rolb (unsigned char __X, int __C) +{ + return __builtin_ia32_rolqi (__X, __C); +} + +/* 16bit rol */ +extern __inline unsigned short +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rolw (unsigned short __X, int __C) +{ + return __builtin_ia32_rolhi (__X, __C); +} + +/* 32bit rol */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rold (unsigned int __X, int __C) +{ + __C &= 31; + return (__X << __C) | (__X >> (-__C & 31)); +} + +/* 8bit ror */ +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rorb (unsigned char __X, int __C) +{ + return __builtin_ia32_rorqi (__X, __C); +} + +/* 16bit ror */ +extern __inline unsigned short +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rorw (unsigned short __X, int __C) +{ + return __builtin_ia32_rorhi (__X, __C); +} + +/* 32bit ror */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rord (unsigned int __X, int __C) +{ + __C &= 31; + return (__X >> __C) | (__X << (-__C & 31)); +} + +/* Pause */ +extern 
__inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__pause (void) +{ + __builtin_ia32_pause (); +} + +#ifdef __x86_64__ +/* 64bit bsf */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsfq (long long __X) +{ + return __builtin_ctzll (__X); +} + +/* 64bit bsr */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bsrq (long long __X) +{ + return __builtin_ia32_bsrdi (__X); +} + +/* 64bit bswap */ +extern __inline long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bswapq (long long __X) +{ + return __builtin_bswap64 (__X); +} + +#ifndef __CRC32__ +#pragma GCC push_options +#pragma GCC target("crc32") +#define __DISABLE_CRC32__ +#endif /* __CRC32__ */ + +/* 64bit accumulate CRC32 (polynomial 0x11EDC6F41) value. */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__crc32q (unsigned long long __C, unsigned long long __V) +{ + return __builtin_ia32_crc32di (__C, __V); +} + +#ifdef __DISABLE_CRC32__ +#undef __DISABLE_CRC32__ +#pragma GCC pop_options +#endif /* __DISABLE_CRC32__ */ + +/* 64bit popcnt */ +extern __inline long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__popcntq (unsigned long long __X) +{ + return __builtin_popcountll (__X); +} + +/* 64bit rol */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rolq (unsigned long long __X, int __C) +{ + __C &= 63; + return (__X << __C) | (__X >> (-__C & 63)); +} + +/* 64bit ror */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__rorq (unsigned long long __X, int __C) +{ + __C &= 63; + return (__X >> __C) | (__X << (-__C & 63)); +} + +/* Read flags register */ +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__readeflags (void) +{ + return __builtin_ia32_readeflags_u64 (); +} + +/* Write flags register */ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__writeeflags (unsigned long long __X) +{ + __builtin_ia32_writeeflags_u64 (__X); +} + +#define _bswap64(a) __bswapq(a) +#define _popcnt64(a) __popcntq(a) +#else + +/* Read flags register */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__readeflags (void) +{ + return __builtin_ia32_readeflags_u32 (); +} + +/* Write flags register */ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__writeeflags (unsigned int __X) +{ + __builtin_ia32_writeeflags_u32 (__X); +} + +#endif + +/* On LP64 systems, longs are 64-bit. Use the appropriate rotate + * function. 
*/ +#ifdef __LP64__ +#define _lrotl(a,b) __rolq((a), (b)) +#define _lrotr(a,b) __rorq((a), (b)) +#else +#define _lrotl(a,b) __rold((a), (b)) +#define _lrotr(a,b) __rord((a), (b)) +#endif + +#define _bit_scan_forward(a) __bsfd(a) +#define _bit_scan_reverse(a) __bsrd(a) +#define _bswap(a) __bswapd(a) +#define _popcnt32(a) __popcntd(a) +#ifndef __iamcu__ +#define _rdpmc(a) __rdpmc(a) +#define _rdtscp(a) __rdtscp(a) +#endif /* __iamcu__ */ +#define _rdtsc() __rdtsc() +#define _rotwl(a,b) __rolw((a), (b)) +#define _rotwr(a,b) __rorw((a), (b)) +#define _rotl(a,b) __rold((a), (b)) +#define _rotr(a,b) __rord((a), (b)) diff --git a/include-gcc/immintrin.h b/include-gcc/immintrin.h new file mode 100644 index 0000000..b220d87 --- /dev/null +++ b/include-gcc/immintrin.h @@ -0,0 +1,143 @@ +/* Copyright (C) 2008-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#define _IMMINTRIN_H_INCLUDED + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#ifdef __SSE2__ +#include + +#include +#endif + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#ifdef __SSE2__ +#include + +#include + +#include +#endif + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#endif /* _IMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/keylockerintrin.h b/include-gcc/keylockerintrin.h new file mode 100644 index 0000000..09c4712 --- /dev/null +++ b/include-gcc/keylockerintrin.h @@ -0,0 +1,129 @@ +/* Copyright (C) 2018-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _KEYLOCKERINTRIN_H_INCLUDED +#define _KEYLOCKERINTRIN_H_INCLUDED + +#ifndef __KL__ +#pragma GCC push_options +#pragma GCC target("kl") +#define __DISABLE_KL__ +#endif /* __KL__ */ + + +extern __inline +void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadiwkey (unsigned int __I, __m128i __A, __m128i __B, __m128i __C) +{ + __builtin_ia32_loadiwkey ((__v2di) __B, (__v2di) __C, (__v2di) __A, __I); +} + +extern __inline +unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_encodekey128_u32 (unsigned int __I, __m128i __A, void * __P) +{ + return __builtin_ia32_encodekey128_u32 (__I, (__v2di)__A, __P); +} + +extern __inline +unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_encodekey256_u32 (unsigned int __I, __m128i __A, __m128i __B, void * __P) +{ + return __builtin_ia32_encodekey256_u32 (__I, (__v2di)__A, (__v2di)__B, __P); +} + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdec128kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesdec128kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdec256kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesdec256kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenc128kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesenc128kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenc256kl_u8 (__m128i * __A, __m128i __B, const void * __P) +{ + return __builtin_ia32_aesenc256kl_u8 ((__v2di *) __A, (__v2di) __B, __P); +} + +#ifdef __DISABLE_KL__ +#undef __DISABLE_KL__ +#pragma GCC pop_options +#endif /* __DISABLE_KL__ */ + +#ifndef __WIDEKL__ +#pragma GCC push_options +#pragma GCC target("widekl") +#define __DISABLE_WIDEKL__ +#endif /* __WIDEKL__ */ + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdecwide128kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesdecwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P); +} + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdecwide256kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesdecwide256kl_u8 ((__v2di *) __A, (__v2di *) __B, __P); +} + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesencwide128kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesencwide128kl_u8 ((__v2di *) __A, (__v2di *) __B, __P); +} + +extern __inline +unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesencwide256kl_u8(__m128i __A[8], const __m128i __B[8], const void * __P) +{ + return __builtin_ia32_aesencwide256kl_u8 ((__v2di *) __A, (__v2di *) 
__B, __P); +} +#ifdef __DISABLE_WIDEKL__ +#undef __DISABLE_WIDEKL__ +#pragma GCC pop_options +#endif /* __DISABLE_WIDEKL__ */ +#endif /* _KEYLOCKERINTRIN_H_INCLUDED */ diff --git a/include-gcc/lwpintrin.h b/include-gcc/lwpintrin.h new file mode 100644 index 0000000..b3e9e1d --- /dev/null +++ b/include-gcc/lwpintrin.h @@ -0,0 +1,107 @@ +/* Copyright (C) 2007-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _LWPINTRIN_H_INCLUDED +#define _LWPINTRIN_H_INCLUDED + +#ifndef __LWP__ +#pragma GCC push_options +#pragma GCC target("lwp") +#define __DISABLE_LWP__ +#endif /* __LWP__ */ + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__llwpcb (void *__pcbAddress) +{ + __builtin_ia32_llwpcb (__pcbAddress); +} + +extern __inline void * __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__slwpcb (void) +{ + return __builtin_ia32_slwpcb (); +} + +#ifdef __OPTIMIZE__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval32 (unsigned int __data2, unsigned int __data1, unsigned int __flags) +{ + __builtin_ia32_lwpval32 (__data2, __data1, __flags); +} + +#ifdef __x86_64__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpval64 (unsigned long long __data2, unsigned int __data1, + unsigned int __flags) +{ + __builtin_ia32_lwpval64 (__data2, __data1, __flags); +} +#endif +#else +#define __lwpval32(D2, D1, F) \ + (__builtin_ia32_lwpval32 ((unsigned int) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) +#ifdef __x86_64__ +#define __lwpval64(D2, D1, F) \ + (__builtin_ia32_lwpval64 ((unsigned long long) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) +#endif +#endif + + +#ifdef __OPTIMIZE__ +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins32 (unsigned int __data2, unsigned int __data1, unsigned int __flags) +{ + return __builtin_ia32_lwpins32 (__data2, __data1, __flags); +} + +#ifdef __x86_64__ +extern __inline unsigned char __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lwpins64 (unsigned long long __data2, unsigned int __data1, + unsigned int __flags) +{ + return __builtin_ia32_lwpins64 (__data2, __data1, __flags); +} +#endif +#else +#define __lwpins32(D2, D1, F) \ + (__builtin_ia32_lwpins32 ((unsigned int) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) +#ifdef __x86_64__ +#define __lwpins64(D2, D1, F) \ + (__builtin_ia32_lwpins64 ((unsigned long long) (D2), (unsigned int) (D1), \ + (unsigned int) (F))) 
+#endif +#endif + +#ifdef __DISABLE_LWP__ +#undef __DISABLE_LWP__ +#pragma GCC pop_options +#endif /* __DISABLE_LWP__ */ + +#endif /* _LWPINTRIN_H_INCLUDED */ diff --git a/include-gcc/lzcntintrin.h b/include-gcc/lzcntintrin.h new file mode 100644 index 0000000..4d81985 --- /dev/null +++ b/include-gcc/lzcntintrin.h @@ -0,0 +1,75 @@ +/* Copyright (C) 2009-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + + +#ifndef _LZCNTINTRIN_H_INCLUDED +#define _LZCNTINTRIN_H_INCLUDED + +#ifndef __LZCNT__ +#pragma GCC push_options +#pragma GCC target("lzcnt") +#define __DISABLE_LZCNT__ +#endif /* __LZCNT__ */ + +extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt16 (unsigned short __X) +{ + return __builtin_ia32_lzcnt_u16 (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt32 (unsigned int __X) +{ + return __builtin_ia32_lzcnt_u32 (__X); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_lzcnt_u32 (unsigned int __X) +{ + return __builtin_ia32_lzcnt_u32 (__X); +} + +#ifdef __x86_64__ +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__lzcnt64 (unsigned long long __X) +{ + return __builtin_ia32_lzcnt_u64 (__X); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_lzcnt_u64 (unsigned long long __X) +{ + return __builtin_ia32_lzcnt_u64 (__X); +} +#endif + +#ifdef __DISABLE_LZCNT__ +#undef __DISABLE_LZCNT__ +#pragma GCC pop_options +#endif /* __DISABLE_LZCNT__ */ + +#endif /* _LZCNTINTRIN_H_INCLUDED */ diff --git a/include-gcc/mm3dnow.h b/include-gcc/mm3dnow.h new file mode 100644 index 0000000..f8ef374 --- /dev/null +++ b/include-gcc/mm3dnow.h @@ -0,0 +1,233 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the mm3dnow.h (of supposedly AMD origin) included with + MSVC 7.1. */ + +#ifndef _MM3DNOW_H_INCLUDED +#define _MM3DNOW_H_INCLUDED + +#include +#include + +#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW__ +#pragma GCC push_options +#ifdef __x86_64__ +#pragma GCC target("sse,3dnow") +#else +#pragma GCC target("3dnow") +#endif +#define __DISABLE_3dNOW__ +#endif /* __3dNOW__ */ + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_femms (void) +{ + __builtin_ia32_femms(); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pavgusb (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pavgusb ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pf2id (__m64 __A) +{ + return (__m64)__builtin_ia32_pf2id ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfacc ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfadd (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfadd ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpeq (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpeq ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpge (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpge ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfcmpgt (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfcmpgt ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmax (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmax ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmin (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmin ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfmul (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfmul ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcp (__m64 __A) +{ + return (__m64)__builtin_ia32_pfrcp ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcpit1 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrcpit1 ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrcpit2 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrcpit2 ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_m_pfrsqrt (__m64 __A) +{ + return (__m64)__builtin_ia32_pfrsqrt ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfrsqit1 (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfrsqit1 ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfsub (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfsub ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfsubr (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfsubr ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pi2fd (__m64 __A) +{ + return (__m64)__builtin_ia32_pi2fd ((__v2si)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhrw (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pmulhrw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetch (void *__P) +{ + __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_float (float __A) +{ + return __extension__ (__m64)(__v2sf){ __A, 0.0f }; +} + +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_float (__m64 __A) +{ + union { __v2sf v; float a[2]; } __tmp; + __tmp.v = (__v2sf)__A; + return __tmp.a[0]; +} + +#ifdef __DISABLE_3dNOW__ +#undef __DISABLE_3dNOW__ +#pragma GCC pop_options +#endif /* __DISABLE_3dNOW__ */ + +#if defined __x86_64__ && !defined __SSE__ || !defined __3dNOW_A__ +#pragma GCC push_options +#ifdef __x86_64__ +#pragma GCC target("sse,3dnowa") +#else +#pragma GCC target("3dnowa") +#endif +#define __DISABLE_3dNOW_A__ +#endif /* __3dNOW_A__ */ + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pf2iw (__m64 __A) +{ + return (__m64)__builtin_ia32_pf2iw ((__v2sf)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfnacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfnacc ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pfpnacc (__m64 __A, __m64 __B) +{ + return (__m64)__builtin_ia32_pfpnacc ((__v2sf)__A, (__v2sf)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pi2fw (__m64 __A) +{ + return (__m64)__builtin_ia32_pi2fw ((__v2si)__A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pswapd (__m64 __A) +{ + return (__m64)__builtin_ia32_pswapdsf ((__v2sf)__A); +} + +#ifdef __DISABLE_3dNOW_A__ +#undef __DISABLE_3dNOW_A__ +#pragma GCC pop_options +#endif /* __DISABLE_3dNOW_A__ */ + +#endif /* _MM3DNOW_H_INCLUDED */ diff --git a/include-gcc/mm_malloc.h b/include-gcc/mm_malloc.h new file mode 100644 index 0000000..3527283 --- /dev/null +++ b/include-gcc/mm_malloc.h @@ -0,0 +1,57 @@ +/* Copyright (C) 2004-2023 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _MM_MALLOC_H_INCLUDED +#define _MM_MALLOC_H_INCLUDED + +#include + +/* We can't depend on since the prototype of posix_memalign + may not be visible. */ +#ifndef __cplusplus +extern int posix_memalign (void **, size_t, size_t); +#else +extern "C" int posix_memalign (void **, size_t, size_t) throw (); +#endif + +static __inline void * +_mm_malloc (size_t __size, size_t __alignment) +{ + void *__ptr; + if (__alignment == 1) + return malloc (__size); + if (__alignment == 2 || (sizeof (void *) == 8 && __alignment == 4)) + __alignment = sizeof (void *); + if (posix_memalign (&__ptr, __alignment, __size) == 0) + return __ptr; + else + return NULL; +} + +static __inline void +_mm_free (void *__ptr) +{ + free (__ptr); +} + +#endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/include-gcc/mmintrin.h b/include-gcc/mmintrin.h new file mode 100644 index 0000000..fbac9c3 --- /dev/null +++ b/include-gcc/mmintrin.h @@ -0,0 +1,965 @@ +/* Copyright (C) 2002-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef _MMINTRIN_H_INCLUDED +#define _MMINTRIN_H_INCLUDED + +#if defined __x86_64__ && !defined __SSE__ || !defined __MMX__ +#pragma GCC push_options +#ifdef __MMX_WITH_SSE__ +#pragma GCC target("sse2") +#elif defined __x86_64__ +#pragma GCC target("sse,mmx") +#else +#pragma GCC target("mmx") +#endif +#define __DISABLE_MMX__ +#endif /* __MMX__ */ + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. 
*/ +typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__)); +typedef int __m32 __attribute__ ((__vector_size__ (4), __may_alias__)); +typedef short __m16 __attribute__ ((__vector_size__ (2), __may_alias__)); + +/* Unaligned version of the same type */ +typedef int __m64_u __attribute__ ((__vector_size__ (8), __may_alias__, __aligned__ (1))); +typedef int __m32_u __attribute__ ((__vector_size__ (4), \ + __may_alias__, __aligned__ (1))); +typedef short __m16_u __attribute__ ((__vector_size__ (2), \ + __may_alias__, __aligned__ (1))); + +/* Internal data types for implementing the intrinsics. */ +typedef int __v2si __attribute__ ((__vector_size__ (8))); +typedef short __v4hi __attribute__ ((__vector_size__ (8))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); +typedef long long __v1di __attribute__ ((__vector_size__ (8))); +typedef float __v2sf __attribute__ ((__vector_size__ (8))); + +/* Empty the multimedia state. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_empty (void) +{ + __builtin_ia32_emms (); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_empty (void) +{ + _mm_empty (); +} + +/* Convert I to a __m64 object. The integer is zero-extended to 64-bits. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_si64 (int __i) +{ + return (__m64) __builtin_ia32_vec_init_v2si (__i, 0); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_int (int __i) +{ + return _mm_cvtsi32_si64 (__i); +} + +#ifdef __x86_64__ +/* Convert I to a __m64 object. */ + +/* Intel intrinsic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_from_int64 (long long __i) +{ + return (__m64) __i; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_m64 (long long __i) +{ + return (__m64) __i; +} + +/* Microsoft intrinsic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_si64 (long long __i) +{ + return (__m64) __i; +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi64x (long long __i) +{ + return (__m64) __i; +} +#endif + +/* Convert the lower 32 bits of the __m64 object into an integer. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si32 (__m64 __i) +{ + return __builtin_ia32_vec_ext_v2si ((__v2si)__i, 0); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_int (__m64 __i) +{ + return _mm_cvtsi64_si32 (__i); +} + +#ifdef __x86_64__ +/* Convert the __m64 object to a 64bit integer. */ + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_to_int64 (__m64 __i) +{ + return (long long)__i; +} + +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtm64_si64 (__m64 __i) +{ + return (long long)__i; +} + +/* Microsoft intrinsic. 
*/ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_si64x (__m64 __i) +{ + return (long long)__i; +} +#endif + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with signed saturation. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packsswb ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packsswb (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pi16 (__m1, __m2); +} + +/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of + the result, and the two 32-bit values from M2 into the upper two 16-bit + values of the result, all with signed saturation. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packssdw ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packssdw (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pi32 (__m1, __m2); +} + +/* Pack the four 16-bit values from M1 into the lower four 8-bit values of + the result, and the four 16-bit values from M2 into the upper four 8-bit + values of the result, all with unsigned saturation. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_packuswb ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_packuswb (__m64 __m1, __m64 __m2) +{ + return _mm_packs_pu16 (__m1, __m2); +} + +/* Interleave the four 8-bit values from the high half of M1 with the four + 8-bit values from the high half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhbw ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhbw (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi8 (__m1, __m2); +} + +/* Interleave the two 16-bit values from the high half of M1 with the two + 16-bit values from the high half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhwd ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhwd (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi16 (__m1, __m2); +} + +/* Interleave the 32-bit value from the high half of M1 with the 32-bit + value from the high half of M2. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckhdq ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckhdq (__m64 __m1, __m64 __m2) +{ + return _mm_unpackhi_pi32 (__m1, __m2); +} + +/* Interleave the four 8-bit values from the low half of M1 with the four + 8-bit values from the low half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpcklbw ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpcklbw (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi8 (__m1, __m2); +} + +/* Interleave the two 16-bit values from the low half of M1 with the two + 16-bit values from the low half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpcklwd ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpcklwd (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi16 (__m1, __m2); +} + +/* Interleave the 32-bit value from the low half of M1 with the 32-bit + value from the low half of M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_punpckldq ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_punpckldq (__m64 __m1, __m64 __m2) +{ + return _mm_unpacklo_pi32 (__m1, __m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddb (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi8 (__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddw (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi16 (__m1, __m2); +} + +/* Add the 32-bit values in M1 to the 32-bit values in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddd (__m64 __m1, __m64 __m2) +{ + return _mm_add_pi32 (__m1, __m2); +} + +/* Add the 64-bit values in M1 to the 64-bit values in M2. 
*/ +#ifndef __SSE2__ +#pragma GCC push_options +#ifdef __MMX_WITH_SSE__ +#pragma GCC target("sse2") +#else +#pragma GCC target("sse2,mmx") +#endif +#define __DISABLE_SSE2__ +#endif /* __SSE2__ */ + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_si64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddq ((__v1di)__m1, (__v1di)__m2); +} +#ifdef __DISABLE_SSE2__ +#undef __DISABLE_SSE2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE2__ */ + +/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed + saturated arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddsb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddsb (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pi8 (__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed + saturated arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddsw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddsw (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pi16 (__m1, __m2); +} + +/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddusb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddusb (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pu8 (__m1, __m2); +} + +/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned + saturated arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_adds_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_paddusw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_paddusw (__m64 __m1, __m64 __m2) +{ + return _mm_adds_pu16 (__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubb (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi8 (__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubw (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi16 (__m1, __m2); +} + +/* Subtract the 32-bit values in M2 from the 32-bit values in M1. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubd (__m64 __m1, __m64 __m2) +{ + return _mm_sub_pi32 (__m1, __m2); +} + +/* Add the 64-bit values in M1 to the 64-bit values in M2. */ +#ifndef __SSE2__ +#pragma GCC push_options +#ifdef __MMX_WITH_SSE__ +#pragma GCC target("sse2") +#else +#pragma GCC target("sse2,mmx") +#endif +#define __DISABLE_SSE2__ +#endif /* __SSE2__ */ + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_si64 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubq ((__v1di)__m1, (__v1di)__m2); +} +#ifdef __DISABLE_SSE2__ +#undef __DISABLE_SSE2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE2__ */ + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed + saturating arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubsb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubsb (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pi8 (__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + signed saturating arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubsw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubsw (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pi16 (__m1, __m2); +} + +/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pu8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubusb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubusb (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pu8 (__m1, __m2); +} + +/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using + unsigned saturating arithmetic. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_subs_pu16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_psubusw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psubusw (__m64 __m1, __m64 __m2) +{ + return _mm_subs_pu16 (__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing + four 32-bit intermediate results, which are then summed by pairs to + produce two 32-bit results. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_madd_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmaddwd ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaddwd (__m64 __m1, __m64 __m2) +{ + return _mm_madd_pi16 (__m1, __m2); +} + +/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in + M2 and produce the high 16 bits of the 32-bit results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmulhw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhw (__m64 __m1, __m64 __m2) +{ + return _mm_mulhi_pi16 (__m1, __m2); +} + +/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce + the low 16 bits of the results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pmullw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmullw (__m64 __m1, __m64 __m2) +{ + return _mm_mullo_pi16 (__m1, __m2); +} + +/* Shift four 16-bit values in M left by COUNT. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psllw ((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllw (__m64 __m, __m64 __count) +{ + return _mm_sll_pi16 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psllwi ((__v4hi)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllwi (__m64 __m, int __count) +{ + return _mm_slli_pi16 (__m, __count); +} + +/* Shift two 32-bit values in M left by COUNT. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_pslld ((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pslld (__m64 __m, __m64 __count) +{ + return _mm_sll_pi32 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_pslldi ((__v2si)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pslldi (__m64 __m, int __count) +{ + return _mm_slli_pi32 (__m, __count); +} + +/* Shift the 64-bit value in M left by COUNT. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sll_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psllq ((__v1di)__m, (__v1di)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllq (__m64 __m, __m64 __count) +{ + return _mm_sll_si64 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_slli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psllqi ((__v1di)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psllqi (__m64 __m, int __count) +{ + return _mm_slli_si64 (__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psraw ((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psraw (__m64 __m, __m64 __count) +{ + return _mm_sra_pi16 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrawi ((__v4hi)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrawi (__m64 __m, int __count) +{ + return _mm_srai_pi16 (__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sra_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrad ((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrad (__m64 __m, __m64 __count) +{ + return _mm_sra_pi32 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srai_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psradi ((__v2si)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psradi (__m64 __m, int __count) +{ + return _mm_srai_pi32 (__m, __count); +} + +/* Shift four 16-bit values in M right by COUNT; shift in zeros. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_pi16 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrlw ((__v4hi)__m, (__v4hi)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlw (__m64 __m, __m64 __count) +{ + return _mm_srl_pi16 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi16 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrlwi ((__v4hi)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlwi (__m64 __m, int __count) +{ + return _mm_srli_pi16 (__m, __count); +} + +/* Shift two 32-bit values in M right by COUNT; shift in zeros. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_pi32 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrld ((__v2si)__m, (__v2si)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrld (__m64 __m, __m64 __count) +{ + return _mm_srl_pi32 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_pi32 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrldi ((__v2si)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrldi (__m64 __m, int __count) +{ + return _mm_srli_pi32 (__m, __count); +} + +/* Shift the 64-bit value in M left by COUNT; shift in zeros. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srl_si64 (__m64 __m, __m64 __count) +{ + return (__m64) __builtin_ia32_psrlq ((__v1di)__m, (__v1di)__count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlq (__m64 __m, __m64 __count) +{ + return _mm_srl_si64 (__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_srli_si64 (__m64 __m, int __count) +{ + return (__m64) __builtin_ia32_psrlqi ((__v1di)__m, __count); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_psrlqi (__m64 __m, int __count) +{ + return _mm_srli_si64 (__m, __count); +} + +/* Bit-wise AND the 64-bit values in M1 and M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pand (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pand (__m64 __m1, __m64 __m2) +{ + return _mm_and_si64 (__m1, __m2); +} + +/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the + 64-bit value in M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pandn (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pandn (__m64 __m1, __m64 __m2) +{ + return _mm_andnot_si64 (__m1, __m2); +} + +/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_por (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_por (__m64 __m1, __m64 __m2) +{ + return _mm_or_si64 (__m1, __m2); +} + +/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_si64 (__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pxor (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pxor (__m64 __m1, __m64 __m2) +{ + return _mm_xor_si64 (__m1, __m2); +} + +/* Compare eight 8-bit values. The result of the comparison is 0xFF if the + test is true and zero if false. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqb (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi8 (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi8 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtb ((__v8qi)__m1, (__v8qi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtb (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi8 (__m1, __m2); +} + +/* Compare four 16-bit values. The result of the comparison is 0xFFFF if + the test is true and zero if false. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqw (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi16 (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi16 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtw ((__v4hi)__m1, (__v4hi)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtw (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi16 (__m1, __m2); +} + +/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if + the test is true and zero if false. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpeqd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpeqd (__m64 __m1, __m64 __m2) +{ + return _mm_cmpeq_pi32 (__m1, __m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_pi32 (__m64 __m1, __m64 __m2) +{ + return (__m64) __builtin_ia32_pcmpgtd ((__v2si)__m1, (__v2si)__m2); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pcmpgtd (__m64 __m1, __m64 __m2) +{ + return _mm_cmpgt_pi32 (__m1, __m2); +} + +/* Creates a 64-bit zero. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_si64 (void) +{ + return (__m64)0LL; +} + +/* Creates a vector of two 32-bit values; I0 is least significant. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi32 (int __i1, int __i0) +{ + return (__m64) __builtin_ia32_vec_init_v2si (__i0, __i1); +} + +/* Creates a vector of four 16-bit values; W0 is least significant. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi16 (short __w3, short __w2, short __w1, short __w0) +{ + return (__m64) __builtin_ia32_vec_init_v4hi (__w0, __w1, __w2, __w3); +} + +/* Creates a vector of eight 8-bit values; B0 is least significant. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_pi8 (char __b7, char __b6, char __b5, char __b4, + char __b3, char __b2, char __b1, char __b0) +{ + return (__m64) __builtin_ia32_vec_init_v8qi (__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7); +} + +/* Similar, but with the arguments in reverse order. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi32 (int __i0, int __i1) +{ + return _mm_set_pi32 (__i1, __i0); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi16 (short __w0, short __w1, short __w2, short __w3) +{ + return _mm_set_pi16 (__w3, __w2, __w1, __w0); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_pi8 (char __b0, char __b1, char __b2, char __b3, + char __b4, char __b5, char __b6, char __b7) +{ + return _mm_set_pi8 (__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +} + +/* Creates a vector of two 32-bit values, both elements containing I. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi32 (int __i) +{ + return _mm_set_pi32 (__i, __i); +} + +/* Creates a vector of four 16-bit values, all elements containing W. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi16 (short __w) +{ + return _mm_set_pi16 (__w, __w, __w, __w); +} + +/* Creates a vector of eight 8-bit values, all elements containing B. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_pi8 (char __b) +{ + return _mm_set_pi8 (__b, __b, __b, __b, __b, __b, __b, __b); +} +#ifdef __DISABLE_MMX__ +#undef __DISABLE_MMX__ +#pragma GCC pop_options +#endif /* __DISABLE_MMX__ */ + +#endif /* _MMINTRIN_H_INCLUDED */ diff --git a/include-gcc/movdirintrin.h b/include-gcc/movdirintrin.h new file mode 100644 index 0000000..92b500e --- /dev/null +++ b/include-gcc/movdirintrin.h @@ -0,0 +1,74 @@ +/* Copyright (C) 2018-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +#ifndef _MOVDIRINTRIN_H_INCLUDED +#define _MOVDIRINTRIN_H_INCLUDED + +#ifndef __MOVDIRI__ +#pragma GCC push_options +#pragma GCC target ("movdiri") +#define __DISABLE_MOVDIRI__ +#endif /* __MOVDIRI__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_directstoreu_u32 (void * __P, unsigned int __A) +{ + __builtin_ia32_directstoreu_u32 ((unsigned int *)__P, __A); +} +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_directstoreu_u64 (void * __P, unsigned long long __A) +{ + __builtin_ia32_directstoreu_u64 ((unsigned long long *)__P, __A); +} +#endif + +#ifdef __DISABLE_MOVDIRI__ +#undef __DISABLE_MOVDIRI__ +#pragma GCC pop_options +#endif /* __DISABLE_MOVDIRI__ */ + +#ifndef __MOVDIR64B__ +#pragma GCC push_options +#pragma GCC target ("movdir64b") +#define __DISABLE_MOVDIR64B__ +#endif /* __MOVDIR64B__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_movdir64b (void * __P, const void * __Q) +{ + __builtin_ia32_movdir64b (__P, __Q); +} + +#ifdef __DISABLE_MOVDIR64B__ +#undef __DISABLE_MOVDIR64B__ +#pragma GCC pop_options +#endif /* __DISABLE_MOVDIR64B__ */ +#endif /* _MOVDIRINTRIN_H_INCLUDED. */ diff --git a/include-gcc/mwaitintrin.h b/include-gcc/mwaitintrin.h new file mode 100644 index 0000000..9ade96b --- /dev/null +++ b/include-gcc/mwaitintrin.h @@ -0,0 +1,52 @@ +/* Copyright (C) 2021-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _MWAITINTRIN_H_INCLUDED +#define _MWAITINTRIN_H_INCLUDED + +#ifndef __MWAIT__ +#pragma GCC push_options +#pragma GCC target("mwait") +#define __DISABLE_MWAIT__ +#endif /* __MWAIT__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_monitor (void const * __P, unsigned int __E, unsigned int __H) +{ + __builtin_ia32_monitor (__P, __E, __H); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mwait (unsigned int __E, unsigned int __H) +{ + __builtin_ia32_mwait (__E, __H); +} + +#ifdef __DISABLE_MWAIT__ +#undef __DISABLE_MWAIT__ +#pragma GCC pop_options +#endif /* __DISABLE_MWAIT__ */ + +#endif /* _MWAITINTRIN_H_INCLUDED */ diff --git a/include-gcc/mwaitxintrin.h b/include-gcc/mwaitxintrin.h new file mode 100644 index 0000000..4dc1c9c --- /dev/null +++ b/include-gcc/mwaitxintrin.h @@ -0,0 +1,50 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _MWAITXINTRIN_H_INCLUDED +#define _MWAITXINTRIN_H_INCLUDED + +#ifndef __MWAITX__ +#pragma GCC push_options +#pragma GCC target("mwaitx") +#define __DISABLE_MWAITX__ +#endif /* __MWAITX__ */ + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_monitorx (void const * __P, unsigned int __E, unsigned int __H) +{ + __builtin_ia32_monitorx (__P, __E, __H); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mwaitx (unsigned int __E, unsigned int __H, unsigned int __C) +{ + __builtin_ia32_mwaitx (__E, __H, __C); +} + +#ifdef __DISABLE_MWAITX__ +#undef __DISABLE_MWAITX__ +#pragma GCC pop_options +#endif /* __DISABLE_MWAITX__ */ + +#endif /* _MWAITXINTRIN_H_INCLUDED */ diff --git a/include-gcc/pconfigintrin.h b/include-gcc/pconfigintrin.h new file mode 100644 index 0000000..bd8252a --- /dev/null +++ b/include-gcc/pconfigintrin.h @@ -0,0 +1,78 @@ +/* Copyright (C) 2018-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
+#endif + +#ifndef _PCONFIGINTRIN_H_INCLUDED +#define _PCONFIGINTRIN_H_INCLUDED + +#ifndef __PCONFIG__ +#pragma GCC push_options +#pragma GCC target("pconfig") +#define __DISABLE_PCONFIG__ +#endif /* __PCONFIG__ */ + +#define __pconfig_b(leaf, b, retval) \ + __asm__ __volatile__ ("pconfig\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "b" (b) \ + : "cc") + +#define __pconfig_generic(leaf, b, c, d, retval) \ + __asm__ __volatile__ ("pconfig\n\t" \ + : "=a" (retval), "=b" (b), "=c" (c), "=d" (d) \ + : "a" (leaf), "b" (b), "c" (c), "d" (d) \ + : "cc") + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_pconfig_u32 (const unsigned int __L, size_t __D[]) +{ + enum __pconfig_type + { + __PCONFIG_KEY_PROGRAM = 0x01, + }; + + unsigned int __R = 0; + + if (!__builtin_constant_p (__L)) + __pconfig_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__L) + { + case __PCONFIG_KEY_PROGRAM: + __pconfig_b (__L, __D[0], __R); + break; + default: + __pconfig_generic (__L, __D[0], __D[1], __D[2], __R); + } + return __R; +} + +#ifdef __DISABLE_PCONFIG__ +#undef __DISABLE_PCONFIG__ +#pragma GCC pop_options +#endif /* __DISABLE_PCONFIG__ */ + +#endif /* _PCONFIGINTRIN_H_INCLUDED */ diff --git a/include-gcc/pkuintrin.h b/include-gcc/pkuintrin.h new file mode 100644 index 0000000..257b5b8 --- /dev/null +++ b/include-gcc/pkuintrin.h @@ -0,0 +1,56 @@ +/* Copyright (C) 2015-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _PKUINTRIN_H_INCLUDED +#define _PKUINTRIN_H_INCLUDED + +#ifndef __PKU__ +#pragma GCC push_options +#pragma GCC target("pku") +#define __DISABLE_PKU__ +#endif /* __PKU__ */ + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdpkru_u32 (void) +{ + return __builtin_ia32_rdpkru (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wrpkru (unsigned int __key) +{ + __builtin_ia32_wrpkru (__key); +} + +#ifdef __DISABLE_PKU__ +#undef __DISABLE_PKU__ +#pragma GCC pop_options +#endif /* __DISABLE_PKU__ */ + +#endif /* _PKUINTRIN_H_INCLUDED */ diff --git a/include-gcc/pmmintrin.h b/include-gcc/pmmintrin.h new file mode 100644 index 0000000..b2674ec --- /dev/null +++ b/include-gcc/pmmintrin.h @@ -0,0 +1,121 @@ +/* Copyright (C) 2003-2023 Free Software Foundation, Inc. + + This file is part of GCC. 
+ + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef _PMMINTRIN_H_INCLUDED +#define _PMMINTRIN_H_INCLUDED + +/* We need definitions from the SSE2 and SSE header files*/ +#include +#include + +#ifndef __SSE3__ +#pragma GCC push_options +#pragma GCC target("sse3") +#define __DISABLE_SSE3__ +#endif /* __SSE3__ */ + +/* Additional bits in the MXCSR. */ +#define _MM_DENORMALS_ZERO_MASK 0x0040 +#define _MM_DENORMALS_ZERO_ON 0x0040 +#define _MM_DENORMALS_ZERO_OFF 0x0000 + +#define _MM_SET_DENORMALS_ZERO_MODE(mode) \ + _mm_setcsr ((_mm_getcsr () & ~_MM_DENORMALS_ZERO_MASK) | (mode)) +#define _MM_GET_DENORMALS_ZERO_MODE() \ + (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_addsubps ((__v4sf)__X, (__v4sf)__Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_haddps ((__v4sf)__X, (__v4sf)__Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_ps (__m128 __X, __m128 __Y) +{ + return (__m128) __builtin_ia32_hsubps ((__v4sf)__X, (__v4sf)__Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehdup_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_movshdup ((__v4sf)__X); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_moveldup_ps (__m128 __X) +{ + return (__m128) __builtin_ia32_movsldup ((__v4sf)__X); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_addsubpd ((__v2df)__X, (__v2df)__Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_haddpd ((__v2df)__X, (__v2df)__Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) __builtin_ia32_hsubpd ((__v2df)__X, (__v2df)__Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loaddup_pd (double const *__P) +{ + return _mm_load1_pd (__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movedup_pd (__m128d __X) +{ + return _mm_shuffle_pd 
(__X, __X, _MM_SHUFFLE2 (0,0)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lddqu_si128 (__m128i const *__P) +{ + return (__m128i) __builtin_ia32_lddqu ((char const *)__P); +} + +#ifdef __DISABLE_SSE3__ +#undef __DISABLE_SSE3__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE3__ */ + +#endif /* _PMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/popcntintrin.h b/include-gcc/popcntintrin.h new file mode 100644 index 0000000..b039d5f --- /dev/null +++ b/include-gcc/popcntintrin.h @@ -0,0 +1,53 @@ +/* Copyright (C) 2009-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _POPCNTINTRIN_H_INCLUDED +#define _POPCNTINTRIN_H_INCLUDED + +#ifndef __POPCNT__ +#pragma GCC push_options +#pragma GCC target("popcnt") +#define __DISABLE_POPCNT__ +#endif /* __POPCNT__ */ + +/* Calculate a number of bits set to 1. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_u32 (unsigned int __X) +{ + return __builtin_popcount (__X); +} + +#ifdef __x86_64__ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_popcnt_u64 (unsigned long long __X) +{ + return __builtin_popcountll (__X); +} +#endif + +#ifdef __DISABLE_POPCNT__ +#undef __DISABLE_POPCNT__ +#pragma GCC pop_options +#endif /* __DISABLE_POPCNT__ */ + +#endif /* _POPCNTINTRIN_H_INCLUDED */ diff --git a/include-gcc/prfchiintrin.h b/include-gcc/prfchiintrin.h new file mode 100644 index 0000000..382fc07 --- /dev/null +++ b/include-gcc/prfchiintrin.h @@ -0,0 +1,61 @@ +/* Copyright (C) 2022-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
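
/* Editorial illustration -- not part of the upstream GCC headers or of
   this patch: counting set bits over a buffer with the POPCNT wrappers
   from popcntintrin.h above.  Assumes -mpopcnt and __x86_64__ for the
   64-bit variant.  */
static inline unsigned long long
example_count_bits (const unsigned long long *words, unsigned long n)
{
  unsigned long long total = 0;
  for (unsigned long i = 0; i < n; ++i)
    total += (unsigned long long) _mm_popcnt_u64 (words[i]);
  return total;
}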
+#endif + +#ifndef _PRFCHIINTRIN_H_INCLUDED +#define _PRFCHIINTRIN_H_INCLUDED + +#ifdef __x86_64__ + + +#ifndef __PREFETCHI__ +#pragma GCC push_options +#pragma GCC target("prefetchi") +#define __DISABLE_PREFETCHI__ +#endif /* __PREFETCHI__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetchit0 (void* __P) +{ + __builtin_ia32_prefetchi (__P, 3); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetchit1 (void* __P) +{ + __builtin_ia32_prefetchi (__P, 2); +} + +#ifdef __DISABLE_PREFETCHI__ +#undef __DISABLE_PREFETCHI__ +#pragma GCC pop_options +#endif /* __DISABLE_PREFETCHI__ */ + +#endif /* __x86_64__ */ + +#endif /* _PRFCHIINTRIN_H_INCLUDED */ diff --git a/include-gcc/prfchwintrin.h b/include-gcc/prfchwintrin.h new file mode 100644 index 0000000..f652997 --- /dev/null +++ b/include-gcc/prfchwintrin.h @@ -0,0 +1,37 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#if !defined _IMMINTRIN_H_INCLUDED && !defined _MM3DNOW_H_INCLUDED +# error "Never use directly; include or instead." +#endif + +#ifndef _PRFCHWINTRIN_H_INCLUDED +#define _PRFCHWINTRIN_H_INCLUDED + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_prefetchw (void *__P) +{ + __builtin_prefetch (__P, 1, 3 /* _MM_HINT_T0 */); +} + +#endif /* _PRFCHWINTRIN_H_INCLUDED */ diff --git a/include-gcc/raointintrin.h b/include-gcc/raointintrin.h new file mode 100644 index 0000000..ad9fbaf --- /dev/null +++ b/include-gcc/raointintrin.h @@ -0,0 +1,100 @@ +/* Copyright (C) 2019-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
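
/* Editorial illustration -- not part of the upstream GCC headers or of
   this patch: requesting a cache line in a writable state before an
   update, via _m_prefetchw from prfchwintrin.h above (-mprfchw).  */
static inline void
example_bump_counter (long *counter)
{
  _m_prefetchw (counter);   /* write-intent prefetch of the line */
  *counter += 1;
}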
+#endif // _X86GPRINTRIN_H_INCLUDED + +#ifndef __RAOINTINTRIN_H_INCLUDED +#define __RAOINTINTRIN_H_INCLUDED + +#ifndef __RAOINT__ +#pragma GCC push_options +#pragma GCC target("raoint") +#define __DISABLE_RAOINT__ +#endif /* __RAOINT__ */ + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_aadd_i32 (int *__A, int __B) +{ + __builtin_ia32_aadd32 ((int *)__A, __B); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_aand_i32 (int *__A, int __B) +{ + __builtin_ia32_aand32 ((int *)__A, __B); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_aor_i32 (int *__A, int __B) +{ + __builtin_ia32_aor32 ((int *)__A, __B); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_axor_i32 (int *__A, int __B) +{ + __builtin_ia32_axor32 ((int *)__A, __B); +} + +#ifdef __x86_64__ +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_aadd_i64 (long long *__A, long long __B) +{ + __builtin_ia32_aadd64 ((long long *)__A, __B); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_aand_i64 (long long *__A, long long __B) +{ + __builtin_ia32_aand64 ((long long *)__A, __B); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_aor_i64 (long long *__A, long long __B) +{ + __builtin_ia32_aor64 ((long long *)__A, __B); +} + +extern __inline void +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_axor_i64 (long long *__A, long long __B) +{ + __builtin_ia32_axor64 ((long long *)__A, __B); +} +#endif /* __x86_64__ */ + +#ifdef __DISABLE_RAOINT__ +#undef __DISABLE_RAOINT__ +#pragma GCC pop_options +#endif /* __DISABLE_RAOINT__ */ + +#endif /* __RAOINTINTRIN_H_INCLUDED */ diff --git a/include-gcc/rdseedintrin.h b/include-gcc/rdseedintrin.h new file mode 100644 index 0000000..50f45bd --- /dev/null +++ b/include-gcc/rdseedintrin.h @@ -0,0 +1,66 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
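
/* Editorial illustration -- not part of the upstream GCC headers or of
   this patch: a fire-and-forget atomic increment through the RAO-INT
   wrapper _aadd_i32 defined above (-mraoint).  Unlike a LOCK XADD, no
   previous value is returned.  */
static inline void
example_record_hit (int *shared_counter)
{
  _aadd_i32 (shared_counter, 1);
}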
+#endif + +#ifndef _RDSEEDINTRIN_H_INCLUDED +#define _RDSEEDINTRIN_H_INCLUDED + +#ifndef __RDSEED__ +#pragma GCC push_options +#pragma GCC target("rdseed") +#define __DISABLE_RDSEED__ +#endif /* __RDSEED__ */ + + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdseed16_step (unsigned short *__p) +{ + return __builtin_ia32_rdseed_hi_step (__p); +} + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdseed32_step (unsigned int *__p) +{ + return __builtin_ia32_rdseed_si_step (__p); +} + +#ifdef __x86_64__ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdseed64_step (unsigned long long *__p) +{ + return __builtin_ia32_rdseed_di_step (__p); +} +#endif + +#ifdef __DISABLE_RDSEED__ +#undef __DISABLE_RDSEED__ +#pragma GCC pop_options +#endif /* __DISABLE_RDSEED__ */ + +#endif /* _RDSEEDINTRIN_H_INCLUDED */ diff --git a/include-gcc/rtmintrin.h b/include-gcc/rtmintrin.h new file mode 100644 index 0000000..f722f5b --- /dev/null +++ b/include-gcc/rtmintrin.h @@ -0,0 +1,84 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _RTMINTRIN_H_INCLUDED +#define _RTMINTRIN_H_INCLUDED + +#ifndef __RTM__ +#pragma GCC push_options +#pragma GCC target("rtm") +#define __DISABLE_RTM__ +#endif /* __RTM__ */ + +#define _XBEGIN_STARTED (~0u) +#define _XABORT_EXPLICIT (1 << 0) +#define _XABORT_RETRY (1 << 1) +#define _XABORT_CONFLICT (1 << 2) +#define _XABORT_CAPACITY (1 << 3) +#define _XABORT_DEBUG (1 << 4) +#define _XABORT_NESTED (1 << 5) +#define _XABORT_CODE(x) (((x) >> 24) & 0xFF) + +/* Start an RTM code region. Return _XBEGIN_STARTED on success and the + abort condition otherwise. */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xbegin (void) +{ + return __builtin_ia32_xbegin (); +} + +/* Specify the end of an RTM code region. If it corresponds to the + outermost transaction, then attempts the transaction commit. If the + commit fails, then control is transferred to the outermost transaction + fallback handler. */ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xend (void) +{ + __builtin_ia32_xend (); +} + +/* Force an RTM abort condition. The control is transferred to the + outermost transaction fallback handler with the abort condition IMM. 
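   (An editorial illustration follows; it is not part of the upstream GCC
   headers or of this patch.)  */

/* A minimal sketch, assuming -mrtm: attempt a transactional store and
   report failure so the caller can take its own fallback (e.g. lock)
   path, as the comments above describe.  */
static inline int
example_try_transactional_store (int *slot, int value)
{
  unsigned int status = _xbegin ();
  if (status == _XBEGIN_STARTED)
    {
      *slot = value;   /* executed transactionally */
      _xend ();
      return 1;
    }
  return 0;            /* aborted; 'status' holds the _XABORT_* bits */
}

/* (Upstream text continues.)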
*/ +#ifdef __OPTIMIZE__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xabort (const unsigned int __imm) +{ + __builtin_ia32_xabort (__imm); +} +#else +#define _xabort(N) __builtin_ia32_xabort (N) +#endif /* __OPTIMIZE__ */ + +#ifdef __DISABLE_RTM__ +#undef __DISABLE_RTM__ +#pragma GCC pop_options +#endif /* __DISABLE_RTM__ */ + +#endif /* _RTMINTRIN_H_INCLUDED */ diff --git a/include-gcc/serializeintrin.h b/include-gcc/serializeintrin.h new file mode 100644 index 0000000..d5da003 --- /dev/null +++ b/include-gcc/serializeintrin.h @@ -0,0 +1,49 @@ +/* Copyright (C) 2018-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _SERIALIZE_H_INCLUDED +#define _SERIALIZE_H_INCLUDED + +#ifndef __SERIALIZE__ +#pragma GCC push_options +#pragma GCC target("serialize") +#define __DISABLE_SERIALIZE__ +#endif /* __SERIALIZE__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_serialize (void) +{ + __builtin_ia32_serialize (); +} + +#ifdef __DISABLE_SERIALIZE__ +#undef __DISABLE_SERIALIZE__ +#pragma GCC pop_options +#endif /* __DISABLE_SERIALIZE__ */ + +#endif /* _SERIALIZE_H_INCLUDED. */ diff --git a/include-gcc/sgxintrin.h b/include-gcc/sgxintrin.h new file mode 100644 index 0000000..e12fa16 --- /dev/null +++ b/include-gcc/sgxintrin.h @@ -0,0 +1,253 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
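   (An editorial illustration follows; it is not part of the upstream GCC
   headers or of this patch.)  */

/* A minimal sketch, assuming -mserialize plus the __rdtsc builtin from
   ia32intrin.h: retire all earlier instructions before sampling the TSC,
   using _serialize from serializeintrin.h above.  */
static inline unsigned long long
example_serialized_rdtsc (void)
{
  _serialize ();
  return __rdtsc ();
}

/* (Upstream text continues.)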
*/ + +#ifndef _SGXINTRIN_H_INCLUDED +#define _SGXINTRIN_H_INCLUDED + +#ifndef __SGX__ +#pragma GCC push_options +#pragma GCC target("sgx") +#define __DISABLE_SGX__ +#endif /* __SGX__ */ + +#define __encls_bc(leaf, b, c, retval) \ + __asm__ __volatile__ ("encls\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "b" (b), "c" (c) \ + : "cc") + +#define __encls_bcd(leaf, b, c, d, retval) \ + __asm__ __volatile__("encls\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "b" (b), "c" (c), "d" (d) \ + : "cc") + +#define __encls_c(leaf, c, retval) \ + __asm__ __volatile__("encls\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "c" (c) \ + : "cc") + +#define __encls_edbgrd(leaf, b, c, retval) \ + __asm__ __volatile__("encls\n\t" \ + : "=a" (retval), "=b" (b) \ + : "a" (leaf), "c" (c)) + +#define __encls_generic(leaf, b, c, d, retval) \ + __asm__ __volatile__("encls\n\t" \ + : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\ + : "a" (leaf), "b" (b), "c" (c), "d" (d) \ + : "cc") + +#define __enclu_bc(leaf, b, c, retval) \ + __asm__ __volatile__("enclu\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "b" (b), "c" (c) \ + : "cc") + +#define __enclu_bcd(leaf, b, c, d, retval) \ + __asm__ __volatile__("enclu\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "b" (b), "c" (c), "d" (d) \ + : "cc") + +#define __enclu_eenter(leaf, b, c, retval) \ + __asm__ __volatile__("enclu\n\t" \ + : "=a" (retval), "=c" (c) \ + : "a" (leaf), "b" (b), "c" (c) \ + : "cc") + +#define __enclu_eexit(leaf, b, c, retval) \ + __asm__ __volatile__("enclu\n\t" \ + : "=a" (retval), "=c" (c) \ + : "a" (leaf), "b" (b) \ + : "cc") + +#define __enclu_generic(leaf, b, c, d, retval) \ + __asm__ __volatile__("enclu\n\t" \ + : "=a" (retval), "=b" (b), "=c" (c), "=d" (d)\ + : "a" (leaf), "b" (b), "c" (c), "d" (d) \ + : "cc") + +#define __enclv_bc(leaf, b, c, retval) \ + __asm__ __volatile__("enclv\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "b" (b), "c" (c) \ + : "cc") + +#define __enclv_cd(leaf, c, d, retval) \ + __asm__ __volatile__("enclv\n\t" \ + : "=a" (retval) \ + : "a" (leaf), "c" (c), "d" (d) \ + : "cc") + +#define __enclv_generic(leaf, b, c, d, retval) \ + __asm__ __volatile__("enclv\n\t" \ + : "=a" (retval), "=b" (b), "=c" (b), "=d" (d)\ + : "a" (leaf), "b" (b), "c" (c), "d" (d) \ + : "cc") + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_encls_u32 (const unsigned int __L, size_t __D[]) +{ + enum __encls_type + { + __SGX_ECREATE = 0x00, + __SGX_EADD = 0x01, + __SGX_EINIT = 0x02, + __SGX_EREMOVE = 0x03, + __SGX_EDBGRD = 0x04, + __SGX_EDBGWR = 0x05, + __SGX_EEXTEND = 0x06, + __SGX_ELDB = 0x07, + __SGX_ELDU = 0x08, + __SGX_EBLOCK = 0x09, + __SGX_EPA = 0x0A, + __SGX_EWB = 0x0B, + __SGX_ETRACK = 0x0C, + __SGX_EAUG = 0x0D, + __SGX_EMODPR = 0x0E, + __SGX_EMODT = 0x0F, + __SGX_ERDINFO = 0x10, + __SGX_ETRACKC = 0x11, + __SGX_ELDBC = 0x12, + __SGX_ELDUC = 0x13 + }; + enum __encls_type __T = (enum __encls_type)__L; + unsigned int __R = 0; + if (!__builtin_constant_p (__T)) + __encls_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__T) + { + case __SGX_ECREATE: + case __SGX_EADD: + case __SGX_EDBGWR: + case __SGX_EEXTEND: + case __SGX_EPA: + case __SGX_EMODPR: + case __SGX_EMODT: + case __SGX_EAUG: + case __SGX_ERDINFO: + __encls_bc (__L, __D[0], __D[1], __R); + break; + case __SGX_EINIT: + case __SGX_ELDB: + case __SGX_ELDU: + case __SGX_EWB: + case __SGX_ELDBC: + case __SGX_ELDUC: + __encls_bcd (__L, __D[0], __D[1], __D[2], __R); + break; + case __SGX_EREMOVE: + case __SGX_EBLOCK: + case __SGX_ETRACK: + case 
__SGX_ETRACKC: + __encls_c (__L, __D[1], __R); + break; + case __SGX_EDBGRD: + __encls_edbgrd (__L, __D[0], __D[1], __R); + break; + default: + __encls_generic (__L, __D[0], __D[1], __D[2], __R); + } + return __R; +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enclu_u32 (const unsigned int __L, size_t __D[]) +{ + enum __enclu_type + { + __SGX_EREPORT = 0x00, + __SGX_EGETKEY = 0x01, + __SGX_EENTER = 0x02, + __SGX_ERESUME = 0x03, + __SGX_EEXIT = 0x04, + __SGX_EACCEPT = 0x05, + __SGX_EMODPE = 0x06, + __SGX_EACCEPTCOPY = 0x07 + }; + enum __enclu_type __T = (enum __enclu_type) __L; + unsigned int __R = 0; + if (!__builtin_constant_p (__T)) + __enclu_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__T) + { + case __SGX_EREPORT: + case __SGX_EACCEPTCOPY: + __enclu_bcd (__L, __D[0], __D[1], __D[2], __R); + break; + case __SGX_EGETKEY: + case __SGX_ERESUME: + case __SGX_EACCEPT: + case __SGX_EMODPE: + __enclu_bc (__L, __D[0], __D[1], __R); + break; + case __SGX_EENTER: + __enclu_eenter (__L, __D[0], __D[1], __R); + break; + case __SGX_EEXIT: + __enclu_eexit (__L, __D[0], __D[1], __R); + break; + default: + __enclu_generic (__L, __D[0], __D[1], __D[2], __R); + } + return __R; +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_enclv_u32 (const unsigned int __L, size_t __D[]) +{ + enum __enclv_type + { + __SGX_EDECVIRTCHILD = 0x00, + __SGX_EINCVIRTCHILD = 0x01, + __SGX_ESETCONTEXT = 0x02 + }; + unsigned int __R = 0; + if (!__builtin_constant_p (__L)) + __enclv_generic (__L, __D[0], __D[1], __D[2], __R); + else switch (__L) + { + case __SGX_EDECVIRTCHILD: + case __SGX_EINCVIRTCHILD: + __enclv_bc (__L, __D[0], __D[1], __R); + break; + case __SGX_ESETCONTEXT: + __enclv_cd (__L, __D[1], __D[2], __R); + break; + default: + __enclv_generic (__L, __D[0], __D[1], __D[2], __R); + } + return __R; +} + +#ifdef __DISABLE_SGX__ +#undef __DISABLE_SGX__ +#pragma GCC pop_options +#endif /* __DISABLE_SGX__ */ + +#endif /* _SGXINTRIN_H_INCLUDED */ diff --git a/include-gcc/shaintrin.h b/include-gcc/shaintrin.h new file mode 100644 index 0000000..ea85e31 --- /dev/null +++ b/include-gcc/shaintrin.h @@ -0,0 +1,98 @@ +/* Copyright (C) 2013-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." 
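
/* Editorial illustration -- not part of the upstream GCC headers or of
   this patch: how the _encls_u32 wrapper above passes RBX/RCX/RDX through
   a three-element size_t array.  ENCLS is a privileged (ring-0) SGX
   instruction, so this is shown only for the calling convention; 0x0C is
   the ETRACK leaf, which takes an SECS page address in RCX.  */
static inline unsigned int
example_sgx_etrack (void *secs_page)
{
  size_t regs[3] = { 0, (size_t) secs_page, 0 };   /* rbx, rcx, rdx */
  return _encls_u32 (0x0C /* ETRACK */, regs);
}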
+#endif + +#ifndef _SHAINTRIN_H_INCLUDED +#define _SHAINTRIN_H_INCLUDED + +#ifndef __SHA__ +#pragma GCC push_options +#pragma GCC target("sha") +#define __DISABLE_SHA__ +#endif /* __SHA__ */ + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1msg1_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1msg1 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1msg2_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1msg2 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1nexte_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha1nexte ((__v4si) __A, (__v4si) __B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha1rnds4_epu32 (__m128i __A, __m128i __B, const int __I) +{ + return (__m128i) __builtin_ia32_sha1rnds4 ((__v4si) __A, (__v4si) __B, __I); +} +#else +#define _mm_sha1rnds4_epu32(A, B, I) \ + ((__m128i) __builtin_ia32_sha1rnds4 ((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I))) +#endif + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256msg1_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha256msg1 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256msg2_epu32 (__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_sha256msg2 ((__v4si) __A, (__v4si) __B); +} + +extern __inline __m128i +__attribute__ ((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha256rnds2_epu32 (__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_sha256rnds2 ((__v4si) __A, (__v4si) __B, + (__v4si) __C); +} + +#ifdef __DISABLE_SHA__ +#undef __DISABLE_SHA__ +#pragma GCC pop_options +#endif /* __DISABLE_SHA__ */ + +#endif /* _SHAINTRIN_H_INCLUDED */ diff --git a/include-gcc/smmintrin.h b/include-gcc/smmintrin.h new file mode 100644 index 0000000..1605acb --- /dev/null +++ b/include-gcc/smmintrin.h @@ -0,0 +1,852 @@ +/* Copyright (C) 2007-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 10.0. */ + +#ifndef _SMMINTRIN_H_INCLUDED +#define _SMMINTRIN_H_INCLUDED + +/* We need definitions from the SSSE3, SSE3, SSE2 and SSE header + files. 
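   (An editorial illustration of the SHA wrappers above follows; it is not
   part of the upstream GCC headers or of this patch.)  */

/* A minimal sketch, assuming -msha: four SHA-1 rounds via
   _mm_sha1rnds4_epu32 from shaintrin.h.  The immediate selects the round
   group (0..3 for rounds 0-19, 20-39, 40-59, 60-79); 'wk' holds the four
   scheduled message dwords with the E term already folded in.  */
static inline __m128i
example_sha1_rounds_0_19 (__m128i abcd, __m128i wk)
{
  return _mm_sha1rnds4_epu32 (abcd, wk, 0);
}

/* (Upstream text continues.)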
*/ +#include + +#ifndef __SSE4_1__ +#pragma GCC push_options +#pragma GCC target("sse4.1") +#define __DISABLE_SSE4_1__ +#endif /* __SSE4_1__ */ + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +#define _MM_FROUND_NINT \ + (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_FLOOR \ + (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_CEIL \ + (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_TRUNC \ + (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_RINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) +#define _MM_FROUND_NEARBYINT \ + (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) + +/* Test Instruction */ +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & __M) == 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestz128 ((__v2di)__M, (__v2di)__V); +} + +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & ~__M) == 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestc128 ((__v2di)__M, (__v2di)__V); +} + +/* Packed integer 128-bit bitwise comparison. Return 1 if + (__V & __M) != 0 && (__V & ~__M) != 0. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_si128 (__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestnzc128 ((__v2di)__M, (__v2di)__V); +} + +/* Macros for packed integer 128-bit comparison intrinsics. */ +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +#define _mm_test_all_ones(V) \ + _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V))) + +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V)) + +/* Packed/scalar double precision floating point rounding. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_pd (__m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundpd ((__v2df)__V, __M); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_sd(__m128d __D, __m128d __V, const int __M) +{ + return (__m128d) __builtin_ia32_roundsd ((__v2df)__D, + (__v2df)__V, + __M); +} +#else +#define _mm_round_pd(V, M) \ + ((__m128d) __builtin_ia32_roundpd ((__v2df)(__m128d)(V), (int)(M))) + +#define _mm_round_sd(D, V, M) \ + ((__m128d) __builtin_ia32_roundsd ((__v2df)(__m128d)(D), \ + (__v2df)(__m128d)(V), (int)(M))) +#endif + +/* Packed/scalar single precision floating point rounding. 
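   (An editorial illustration follows; it is not part of the upstream GCC
   headers or of this patch.)  */

/* A minimal sketch, assuming -msse4.1: round both double lanes to the
   nearest integer without raising the inexact exception, using the
   rounding-mode macros and _mm_round_pd defined above.  */
static inline __m128d
example_round_nearest_noexc (__m128d v)
{
  return _mm_round_pd (v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

/* (Upstream text continues.)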
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ps (__m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundps ((__v4sf)__V, __M); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_round_ss (__m128 __D, __m128 __V, const int __M) +{ + return (__m128) __builtin_ia32_roundss ((__v4sf)__D, + (__v4sf)__V, + __M); +} +#else +#define _mm_round_ps(V, M) \ + ((__m128) __builtin_ia32_roundps ((__v4sf)(__m128)(V), (int)(M))) + +#define _mm_round_ss(D, V, M) \ + ((__m128) __builtin_ia32_roundss ((__v4sf)(__m128)(D), \ + (__v4sf)(__m128)(V), (int)(M))) +#endif + +/* Macros for ceil/floor intrinsics. */ +#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL) +#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR) +#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR) + +#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL) +#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL) + +#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR) +#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR) + +/* SSE4.1 */ + +/* Integer blend instructions - select data from 2 sources using + constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_epi16 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pblendw128 ((__v8hi)__X, + (__v8hi)__Y, + __M); +} +#else +#define _mm_blend_epi16(X, Y, M) \ + ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(X), \ + (__v8hi)(__m128i)(Y), (int)(M))) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_epi8 (__m128i __X, __m128i __Y, __m128i __M) +{ + return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__X, + (__v16qi)__Y, + (__v16qi)__M); +} + +/* Single precision floating point blend instructions - select data + from 2 sources using constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_blendps ((__v4sf)__X, + (__v4sf)__Y, + __M); +} +#else +#define _mm_blend_ps(X, Y, M) \ + ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(M))) +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_ps (__m128 __X, __m128 __Y, __m128 __M) +{ + return (__m128) __builtin_ia32_blendvps ((__v4sf)__X, + (__v4sf)__Y, + (__v4sf)__M); +} + +/* Double precision floating point blend instructions - select data + from 2 sources using constant/variable mask. 
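   (An editorial illustration follows; it is not part of the upstream GCC
   headers or of this patch.)  */

/* A minimal sketch, assuming -msse4.1: per-lane select with the variable
   blend above -- lanes whose sign bit is set in 'mask' take 'a', the rest
   take 'b'.  */
static inline __m128
example_select_ps (__m128 mask, __m128 a, __m128 b)
{
  return _mm_blendv_ps (b, a, mask);
}

/* (Upstream text continues.)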
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blend_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_blendpd ((__v2df)__X, + (__v2df)__Y, + __M); +} +#else +#define _mm_blend_pd(X, Y, M) \ + ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(M))) +#endif + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_blendv_pd (__m128d __X, __m128d __Y, __m128d __M) +{ + return (__m128d) __builtin_ia32_blendvpd ((__v2df)__X, + (__v2df)__Y, + (__v2df)__M); +} + +/* Dot product instructions with mask-defined summing and zeroing parts + of result. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_ps (__m128 __X, __m128 __Y, const int __M) +{ + return (__m128) __builtin_ia32_dpps ((__v4sf)__X, + (__v4sf)__Y, + __M); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_dp_pd (__m128d __X, __m128d __Y, const int __M) +{ + return (__m128d) __builtin_ia32_dppd ((__v2df)__X, + (__v2df)__Y, + __M); +} +#else +#define _mm_dp_ps(X, Y, M) \ + ((__m128) __builtin_ia32_dpps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(M))) + +#define _mm_dp_pd(X, Y, M) \ + ((__m128d) __builtin_ia32_dppd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(M))) +#endif + +/* Packed integer 64-bit comparison, zeroing or filling with ones + corresponding parts of result. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) ((__v2di)__X == (__v2di)__Y); +} + +/* Min/max packed integer instructions. 
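   (An editorial illustration follows; it is not part of the upstream GCC
   headers or of this patch.)  */

/* A minimal sketch, assuming -msse4.1: clamp each signed 32-bit lane into
   [lo, hi] with the min/max pair declared just below.  */
static inline __m128i
example_clamp_epi32 (__m128i x, __m128i lo, __m128i hi)
{
  return _mm_min_epi32 (_mm_max_epi32 (x, lo), hi);
}

/* (Upstream text continues.)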
*/ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminsb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminuw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminsd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pminud128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_epu32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaxud128 ((__v4si)__X, (__v4si)__Y); +} + +/* Packed integer 32-bit multiplication with truncation of upper + halves of results. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mullo_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) ((__v4su)__X * (__v4su)__Y); +} + +/* Packed integer 32-bit multiplication of 2 pairs of operands + with two 64-bit results. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__X, (__v4si)__Y); +} + +/* Insert single precision float into packed single precision array + element selected by index N. The bits [7-6] of N define S + index, the bits [5-4] define D index, and bits [3-0] define + zeroing mask for D. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_ps (__m128 __D, __m128 __S, const int __N) +{ + return (__m128) __builtin_ia32_insertps128 ((__v4sf)__D, + (__v4sf)__S, + __N); +} +#else +#define _mm_insert_ps(D, S, N) \ + ((__m128) __builtin_ia32_insertps128 ((__v4sf)(__m128)(D), \ + (__v4sf)(__m128)(S), (int)(N))) +#endif + +/* Helper macro to create the N value for _mm_insert_ps. */ +#define _MM_MK_INSERTPS_NDX(S, D, M) (((S) << 6) | ((D) << 4) | (M)) + +/* Extract binary representation of single precision float from packed + single precision array element of X selected by index N. 
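   (An editorial illustration follows; it is not part of the upstream GCC
   headers or of this patch.)  */

/* A minimal sketch, assuming -msse4.1: copy lane 2 of 's' into lane 0 of
   'd' and zero lane 3, encoding the control byte with the
   _MM_MK_INSERTPS_NDX helper defined above.  */
static inline __m128
example_insertps (__m128 d, __m128 s)
{
  return _mm_insert_ps (d, s, _MM_MK_INSERTPS_NDX (2, 0, 0x8));
}

/* (Upstream text continues.)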
*/ + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_ps (__m128 __X, const int __N) +{ + union { int __i; float __f; } __tmp; + __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)__X, __N); + return __tmp.__i; +} +#else +#define _mm_extract_ps(X, N) \ + (__extension__ \ + ({ \ + union { int __i; float __f; } __tmp; \ + __tmp.__f = __builtin_ia32_vec_ext_v4sf ((__v4sf)(__m128)(X), \ + (int)(N)); \ + __tmp.__i; \ + })) +#endif + +/* Extract binary representation of single precision float into + D from packed single precision array element of S selected + by index N. */ +#define _MM_EXTRACT_FLOAT(D, S, N) \ + { (D) = __builtin_ia32_vec_ext_v4sf ((__v4sf)(S), (N)); } + +/* Extract specified single precision float element into the lower + part of __m128. */ +#define _MM_PICK_OUT_PS(X, N) \ + _mm_insert_ps (_mm_setzero_ps (), (X), \ + _MM_MK_INSERTPS_NDX ((N), 0, 0x0e)) + +/* Insert integer, S, into packed integer array element of D + selected by index N. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi8 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)__D, + __S, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi32 (__m128i __D, int __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v4si ((__v4si)__D, + __S, __N); +} + +#ifdef __x86_64__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_epi64 (__m128i __D, long long __S, const int __N) +{ + return (__m128i) __builtin_ia32_vec_set_v2di ((__v2di)__D, + __S, __N); +} +#endif +#else +#define _mm_insert_epi8(D, S, N) \ + ((__m128i) __builtin_ia32_vec_set_v16qi ((__v16qi)(__m128i)(D), \ + (int)(S), (int)(N))) + +#define _mm_insert_epi32(D, S, N) \ + ((__m128i) __builtin_ia32_vec_set_v4si ((__v4si)(__m128i)(D), \ + (int)(S), (int)(N))) + +#ifdef __x86_64__ +#define _mm_insert_epi64(D, S, N) \ + ((__m128i) __builtin_ia32_vec_set_v2di ((__v2di)(__m128i)(D), \ + (long long)(S), (int)(N))) +#endif +#endif + +/* Extract integer from packed integer array element of X selected by + index N. */ + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi8 (__m128i __X, const int __N) +{ + return (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)__X, __N); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi32 (__m128i __X, const int __N) +{ + return __builtin_ia32_vec_ext_v4si ((__v4si)__X, __N); +} + +#ifdef __x86_64__ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_epi64 (__m128i __X, const int __N) +{ + return __builtin_ia32_vec_ext_v2di ((__v2di)__X, __N); +} +#endif +#else +#define _mm_extract_epi8(X, N) \ + ((int) (unsigned char) __builtin_ia32_vec_ext_v16qi ((__v16qi)(__m128i)(X), (int)(N))) +#define _mm_extract_epi32(X, N) \ + ((int) __builtin_ia32_vec_ext_v4si ((__v4si)(__m128i)(X), (int)(N))) + +#ifdef __x86_64__ +#define _mm_extract_epi64(X, N) \ + ((long long) __builtin_ia32_vec_ext_v2di ((__v2di)(__m128i)(X), (int)(N))) +#endif +#endif + +/* Return horizontal packed word minimum and its index in bits [15:0] + and bits [18:16] respectively. 
*/ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_minpos_epu16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_phminposuw128 ((__v8hi)__X); +} + +/* Packed integer sign-extension. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxbd128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxwd128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxbq128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi32_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxdq128 ((__v4si)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi16_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxwq128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepi8_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovsxbw128 ((__v16qi)__X); +} + +/* Packed integer zero-extension. */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxbd128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxwd128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxbq128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu32_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxdq128 ((__v4si)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu16_epi64 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxwq128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtepu8_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pmovzxbw128 ((__v16qi)__X); +} + +/* Pack 8 double words from 2 operands into 8 words of result with + unsigned saturation. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_packus_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_packusdw128 ((__v4si)__X, (__v4si)__Y); +} + +/* Sum absolute 8-bit integer difference of adjacent groups of 4 + byte integers in the first 2 operands. Starting offsets within + operands are determined by the 3rd mask operand. 
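   (An editorial illustration of the conversion intrinsics above follows;
   it is not part of the upstream GCC headers or of this patch.)  */

/* A minimal sketch, assuming -msse4.1: widen the low four unsigned bytes
   of 'v' to 32-bit lanes with _mm_cvtepu8_epi32, e.g. when promoting
   8-bit pixels before arithmetic.  */
static inline __m128i
example_widen_lo4_u8 (__m128i v)
{
  return _mm_cvtepu8_epi32 (v);
}

/* (Upstream text continues.)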
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mpsadbw_epu8 (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)__X, + (__v16qi)__Y, __M); +} +#else +#define _mm_mpsadbw_epu8(X, Y, M) \ + ((__m128i) __builtin_ia32_mpsadbw128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#endif + +/* Load double quadword using non-temporal aligned hint. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_stream_load_si128 (__m128i *__X) +{ + return (__m128i) __builtin_ia32_movntdqa ((__v2di *) __X); +} + +#ifndef __SSE4_2__ +#pragma GCC push_options +#pragma GCC target("sse4.2") +#define __DISABLE_SSE4_2__ +#endif /* __SSE4_2__ */ + +/* These macros specify the source data format. */ +#define _SIDD_UBYTE_OPS 0x00 +#define _SIDD_UWORD_OPS 0x01 +#define _SIDD_SBYTE_OPS 0x02 +#define _SIDD_SWORD_OPS 0x03 + +/* These macros specify the comparison operation. */ +#define _SIDD_CMP_EQUAL_ANY 0x00 +#define _SIDD_CMP_RANGES 0x04 +#define _SIDD_CMP_EQUAL_EACH 0x08 +#define _SIDD_CMP_EQUAL_ORDERED 0x0c + +/* These macros specify the polarity. */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_NEGATIVE_POLARITY 0x10 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 + +/* These macros specify the output selection in _mm_cmpXstri (). */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* These macros specify the output selection in _mm_cmpXstrm (). */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* Intrinsics for text/string processing. */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrm (__m128i __X, __m128i __Y, const int __M) +{ + return (__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistri (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistri128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrm (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return (__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestri (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestri128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +#else +#define _mm_cmpistrm(X, Y, M) \ + ((__m128i) __builtin_ia32_pcmpistrm128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistri(X, Y, M) \ + ((int) __builtin_ia32_pcmpistri128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) + +#define _mm_cmpestrm(X, LX, Y, LY, M) \ + ((__m128i) __builtin_ia32_pcmpestrm128 ((__v16qi)(__m128i)(X), \ + (int)(LX), (__v16qi)(__m128i)(Y), \ + (int)(LY), (int)(M))) +#define _mm_cmpestri(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestri128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#endif + +/* Intrinsics for text/string processing and reading values of + EFlags. 
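   (An editorial illustration follows; it is not part of the upstream GCC
   headers or of this patch.)  */

/* A minimal sketch, assuming -msse4.2 and that 16 bytes at 'p' are
   readable: locate the first zero byte in a block with the implicit-
   length _mm_cmpistri wrapper above; 16 is returned when the block holds
   no terminator.  */
static inline int
example_find_nul (const char *p)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
  return _mm_cmpistri (_mm_setzero_si128 (), chunk,
                       _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH
                       | _SIDD_LEAST_SIGNIFICANT);
}

/* (Upstream text continues.)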
*/ + +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistra (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistria128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrc (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistric128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistro (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistrio128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrs (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistris128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpistrz (__m128i __X, __m128i __Y, const int __M) +{ + return __builtin_ia32_pcmpistriz128 ((__v16qi)__X, + (__v16qi)__Y, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestra (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestria128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrc (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestric128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestro (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestrio128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrs (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestris128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpestrz (__m128i __X, int __LX, __m128i __Y, int __LY, const int __M) +{ + return __builtin_ia32_pcmpestriz128 ((__v16qi)__X, __LX, + (__v16qi)__Y, __LY, + __M); +} +#else +#define _mm_cmpistra(X, Y, M) \ + ((int) __builtin_ia32_pcmpistria128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistrc(X, Y, M) \ + ((int) __builtin_ia32_pcmpistric128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistro(X, Y, M) \ + ((int) __builtin_ia32_pcmpistrio128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistrs(X, Y, M) \ + ((int) __builtin_ia32_pcmpistris128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) +#define _mm_cmpistrz(X, Y, M) \ + ((int) __builtin_ia32_pcmpistriz128 ((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (int)(M))) + +#define _mm_cmpestra(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestria128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestrc(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestric128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestro(X, LX, Y, LY, M) \ + ((int) 
__builtin_ia32_pcmpestrio128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestrs(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestris128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#define _mm_cmpestrz(X, LX, Y, LY, M) \ + ((int) __builtin_ia32_pcmpestriz128 ((__v16qi)(__m128i)(X), (int)(LX), \ + (__v16qi)(__m128i)(Y), (int)(LY), \ + (int)(M))) +#endif + +/* Packed integer 64-bit comparison, zeroing or filling with ones + corresponding parts of result. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_epi64 (__m128i __X, __m128i __Y) +{ + return (__m128i) ((__v2di)__X > (__v2di)__Y); +} + +#ifdef __DISABLE_SSE4_2__ +#undef __DISABLE_SSE4_2__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE4_2__ */ + +#ifdef __DISABLE_SSE4_1__ +#undef __DISABLE_SSE4_1__ +#pragma GCC pop_options +#endif /* __DISABLE_SSE4_1__ */ + +#include + +#ifndef __CRC32__ +#pragma GCC push_options +#pragma GCC target("crc32") +#define __DISABLE_CRC32__ +#endif /* __CRC32__ */ + +/* Accumulate CRC32 (polynomial 0x11EDC6F41) value. */ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u8 (unsigned int __C, unsigned char __V) +{ + return __builtin_ia32_crc32qi (__C, __V); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u16 (unsigned int __C, unsigned short __V) +{ + return __builtin_ia32_crc32hi (__C, __V); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u32 (unsigned int __C, unsigned int __V) +{ + return __builtin_ia32_crc32si (__C, __V); +} + +#ifdef __x86_64__ +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_crc32_u64 (unsigned long long __C, unsigned long long __V) +{ + return __builtin_ia32_crc32di (__C, __V); +} +#endif + +#ifdef __DISABLE_CRC32__ +#undef __DISABLE_CRC32__ +#pragma GCC pop_options +#endif /* __DISABLE_CRC32__ */ + +#endif /* _SMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/tbmintrin.h b/include-gcc/tbmintrin.h new file mode 100644 index 0000000..9227f9a --- /dev/null +++ b/include-gcc/tbmintrin.h @@ -0,0 +1,180 @@ +/* Copyright (C) 2010-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
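
/* Editorial illustration -- not part of the upstream GCC headers or of
   this patch: accumulating a CRC-32C (Castagnoli) checksum over a byte
   buffer with the wrappers above (-msse4.2 or -mcrc32).  The ~0 seed and
   final inversion are conventions of this example, not of the
   intrinsics.  */
static inline unsigned int
example_crc32c (const unsigned char *p, unsigned long n)
{
  unsigned int crc = 0xFFFFFFFFu;
  for (unsigned long i = 0; i < n; ++i)
    crc = _mm_crc32_u8 (crc, p[i]);
  return crc ^ 0xFFFFFFFFu;
}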
+#endif + +#ifndef _TBMINTRIN_H_INCLUDED +#define _TBMINTRIN_H_INCLUDED + +#ifndef __TBM__ +#pragma GCC push_options +#pragma GCC target("tbm") +#define __DISABLE_TBM__ +#endif /* __TBM__ */ + +#ifdef __OPTIMIZE__ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextri_u32 (unsigned int __X, const unsigned int __I) +{ + return __builtin_ia32_bextri_u32 (__X, __I); +} +#else +#define __bextri_u32(X, I) \ + ((unsigned int)__builtin_ia32_bextri_u32 ((unsigned int)(X), \ + (unsigned int)(I))) +#endif /*__OPTIMIZE__ */ + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcfill_u32 (unsigned int __X) +{ + return __X & (__X + 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blci_u32 (unsigned int __X) +{ + return __X | ~(__X + 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcic_u32 (unsigned int __X) +{ + return ~__X & (__X + 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcmsk_u32 (unsigned int __X) +{ + return __X ^ (__X + 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcs_u32 (unsigned int __X) +{ + return __X | (__X + 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsfill_u32 (unsigned int __X) +{ + return __X | (__X - 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsic_u32 (unsigned int __X) +{ + return ~__X | (__X - 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__t1mskc_u32 (unsigned int __X) +{ + return ~__X | (__X + 1); +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzmsk_u32 (unsigned int __X) +{ + return ~__X & (__X - 1); +} + + + +#ifdef __x86_64__ +#ifdef __OPTIMIZE__ +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__bextri_u64 (unsigned long long __X, const unsigned int __I) +{ + return __builtin_ia32_bextri_u64 (__X, __I); +} +#else +#define __bextri_u64(X, I) \ + ((unsigned long long)__builtin_ia32_bextri_u64 ((unsigned long long)(X), \ + (unsigned long long)(I))) +#endif /*__OPTIMIZE__ */ + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcfill_u64 (unsigned long long __X) +{ + return __X & (__X + 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blci_u64 (unsigned long long __X) +{ + return __X | ~(__X + 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcic_u64 (unsigned long long __X) +{ + return ~__X & (__X + 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcmsk_u64 (unsigned long long __X) +{ + return __X ^ (__X + 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blcs_u64 (unsigned long long __X) +{ + return __X | (__X + 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsfill_u64 (unsigned long long __X) +{ + return __X | (__X - 1); +} + +extern __inline 
unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__blsic_u64 (unsigned long long __X) +{ + return ~__X | (__X - 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__t1mskc_u64 (unsigned long long __X) +{ + return ~__X | (__X + 1); +} + +extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +__tzmsk_u64 (unsigned long long __X) +{ + return ~__X & (__X - 1); +} + + +#endif /* __x86_64__ */ + +#ifdef __DISABLE_TBM__ +#undef __DISABLE_TBM__ +#pragma GCC pop_options +#endif /* __DISABLE_TBM__ */ + +#endif /* _TBMINTRIN_H_INCLUDED */ diff --git a/include-gcc/tmmintrin.h b/include-gcc/tmmintrin.h new file mode 100644 index 0000000..2df29a9 --- /dev/null +++ b/include-gcc/tmmintrin.h @@ -0,0 +1,249 @@ +/* Copyright (C) 2006-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.1. 
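The TBM helpers above are thin wrappers over simple bit identities, so a worked value makes their effect clear; the numbers below are illustrative and the build is assumed to enable TBM (-mtbm, an AMD-only extension).

#include <x86gprintrin.h>

/* Illustrative: for x = 0x58 (binary 01011000),
   __blsfill_u32 (x) = x | (x - 1)  = 0x5F  (fill up to the lowest set bit),
   __tzmsk_u32 (x)   = ~x & (x - 1) = 0x07  (mask of the trailing zeros).  */
static unsigned int
tbm_demo (void)
{
  unsigned int x = 0x58u;
  return __blsfill_u32 (x) ^ __tzmsk_u32 (x);   /* 0x5F ^ 0x07 = 0x58 */
}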
*/ + +#ifndef _TMMINTRIN_H_INCLUDED +#define _TMMINTRIN_H_INCLUDED + +/* We need definitions from the SSE3, SSE2 and SSE header files*/ +#include + +#ifndef __SSSE3__ +#pragma GCC push_options +#pragma GCC target("ssse3") +#define __DISABLE_SSSE3__ +#endif /* __SSSE3__ */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phaddw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phaddd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phaddsw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phaddw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi32 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phaddd ((__v2si)__X, (__v2si)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phaddsw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phsubw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phsubd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_phsubsw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phsubw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi32 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phsubd ((__v2si)__X, (__v2si)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_phsubsw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmaddubsw128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_pmaddubsw ((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pmulhrsw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_pi16 
(__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_pmulhrsw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_pshufb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi8 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_pshufb ((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi8 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psignb128 ((__v16qi)__X, (__v16qi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi16 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psignw128 ((__v8hi)__X, (__v8hi)__Y); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi32 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_psignd128 ((__v4si)__X, (__v4si)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi8 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_psignb ((__v8qi)__X, (__v8qi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi16 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_psignw ((__v4hi)__X, (__v4hi)__Y); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi32 (__m64 __X, __m64 __Y) +{ + return (__m64) __builtin_ia32_psignd ((__v2si)__X, (__v2si)__Y); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi8(__m128i __X, __m128i __Y, const int __N) +{ + return (__m128i) __builtin_ia32_palignr128 ((__v2di)__X, + (__v2di)__Y, __N * 8); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_pi8(__m64 __X, __m64 __Y, const int __N) +{ + return (__m64) __builtin_ia32_palignr ((__v1di)__X, + (__v1di)__Y, __N * 8); +} +#else +#define _mm_alignr_epi8(X, Y, N) \ + ((__m128i) __builtin_ia32_palignr128 ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), \ + (int)(N) * 8)) +#define _mm_alignr_pi8(X, Y, N) \ + ((__m64) __builtin_ia32_palignr ((__v1di)(__m64)(X), \ + (__v1di)(__m64)(Y), \ + (int)(N) * 8)) +#endif + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi8 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pabsb128 ((__v16qi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi16 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pabsw128 ((__v8hi)__X); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi32 (__m128i __X) +{ + return (__m128i) __builtin_ia32_pabsd128 ((__v4si)__X); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi8 (__m64 __X) +{ + return (__m64) __builtin_ia32_pabsb ((__v8qi)__X); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi16 (__m64 __X) +{ + return (__m64) __builtin_ia32_pabsw ((__v4hi)__X); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_abs_pi32 (__m64 __X) +{ + return (__m64) __builtin_ia32_pabsd ((__v2si)__X); +} + +#ifdef __DISABLE_SSSE3__ +#undef __DISABLE_SSSE3__ +#pragma GCC pop_options +#endif /* __DISABLE_SSSE3__ */ + +#endif /* _TMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/tsxldtrkintrin.h b/include-gcc/tsxldtrkintrin.h new file mode 100644 index 0000000..c3dce59 --- /dev/null +++ b/include-gcc/tsxldtrkintrin.h @@ -0,0 +1,56 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _TSXLDTRKINTRIN_H_INCLUDED +#define _TSXLDTRKINTRIN_H_INCLUDED + +#if !defined(__TSXLDTRK__) +#pragma GCC push_options +#pragma GCC target("tsxldtrk") +#define __DISABLE_TSXLDTRK__ +#endif /* __TSXLDTRK__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsusldtrk (void) +{ + __builtin_ia32_xsusldtrk (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xresldtrk (void) +{ + __builtin_ia32_xresldtrk (); +} + +#ifdef __DISABLE_TSXLDTRK__ +#undef __DISABLE_TSXLDTRK__ +#pragma GCC pop_options +#endif /* __DISABLE_TSXLDTRK__ */ + +#endif /* _TSXLDTRKINTRIN_H_INCLUDED */ diff --git a/include-gcc/uintrintrin.h b/include-gcc/uintrintrin.h new file mode 100644 index 0000000..dc5dab3 --- /dev/null +++ b/include-gcc/uintrintrin.h @@ -0,0 +1,84 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
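As a usage sketch for the SSSE3 header above, _mm_shuffle_epi8 with a constant control vector is the workhorse byte permute; the example below simply reverses the sixteen bytes of its input (names illustrative, -mssse3 assumed).

#include <tmmintrin.h>

/* Illustrative: reverse the byte order of a 128-bit vector.  Result byte i
   is taken from source byte ctrl[i], so a descending control reverses.    */
static __m128i
reverse_bytes (__m128i v)
{
  const __m128i ctrl = _mm_setr_epi8 (15, 14, 13, 12, 11, 10, 9, 8,
                                      7, 6, 5, 4, 3, 2, 1, 0);
  return _mm_shuffle_epi8 (v, ctrl);
}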
+#endif + +#ifndef _UINTRNTRIN_H_INCLUDED +#define _UINTRNTRIN_H_INCLUDED + +#ifdef __x86_64__ + +#ifndef __UINTR__ +#pragma GCC push_options +#pragma GCC target ("uintr") +#define __DISABLE_UINTR__ +#endif /* __UINTR__ */ + +struct __uintr_frame +{ + /* RIP of the interrupted user process. */ + unsigned long long rip; + /* RFLAGS of the interrupted user process. */ + unsigned long long rflags; + /* RSP of the interrupted user process. */ + unsigned long long rsp; +}; + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_clui (void) +{ + __builtin_ia32_clui (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_stui (void) +{ + __builtin_ia32_stui (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_senduipi (unsigned long long __R) +{ + __builtin_ia32_senduipi (__R); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_testui (void) +{ + return __builtin_ia32_testui (); +} + +#ifdef __DISABLE_UINTR__ +#undef __DISABLE_UINTR__ +#pragma GCC pop_options +#endif /* __DISABLE_UINTR__ */ + +#endif + +#endif /* _UINTRNTRIN_H_INCLUDED. */ diff --git a/include-gcc/vaesintrin.h b/include-gcc/vaesintrin.h new file mode 100644 index 0000000..0f1cffe --- /dev/null +++ b/include-gcc/vaesintrin.h @@ -0,0 +1,111 @@ +/* Copyright (C) 2017-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . 
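The UINTR intrinsics above map one-to-one onto the CLUI/STUI/SENDUIPI/TESTUI instructions; a minimal sketch of masking user-interrupt delivery around a critical section follows. It assumes a UINTR-capable CPU, kernel support for user interrupts and -muintr; the callback name is illustrative.

#include <x86gprintrin.h>

/* Illustrative only: block user-interrupt delivery while fn runs.
   _clui clears UIF and _stui sets it again; real code would save and
   restore the previous UIF state (e.g. via _testui) instead.          */
static void
run_uintr_masked (void (*fn) (void))
{
  _clui ();
  fn ();
  _stui ();
}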
*/ + +#ifndef __VAESINTRIN_H_INCLUDED +#define __VAESINTRIN_H_INCLUDED + +#if !defined(__VAES__) || !defined(__AVX__) +#pragma GCC push_options +#pragma GCC target("vaes,avx") +#define __DISABLE_VAES__ +#endif /* __VAES__ */ + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_aesdec_epi128 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vaesdec_v32qi ((__v32qi) __A, (__v32qi) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_aesdeclast_epi128 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vaesdeclast_v32qi ((__v32qi) __A, + (__v32qi) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_aesenc_epi128 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vaesenc_v32qi ((__v32qi) __A, (__v32qi) __B); +} + +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_aesenclast_epi128 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vaesenclast_v32qi ((__v32qi) __A, + (__v32qi) __B); +} + +#ifdef __DISABLE_VAES__ +#undef __DISABLE_VAES__ +#pragma GCC pop_options +#endif /* __DISABLE_VAES__ */ + + +#if !defined(__VAES__) || !defined(__AVX512F__) +#pragma GCC push_options +#pragma GCC target("vaes,avx512f") +#define __DISABLE_VAESF__ +#endif /* __VAES__ */ + + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_aesdec_epi128 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vaesdec_v64qi ((__v64qi) __A, (__v64qi) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_aesdeclast_epi128 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vaesdeclast_v64qi ((__v64qi) __A, + (__v64qi) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_aesenc_epi128 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vaesenc_v64qi ((__v64qi) __A, (__v64qi) __B); +} + +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_aesenclast_epi128 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vaesenclast_v64qi ((__v64qi) __A, + (__v64qi) __B); +} + +#ifdef __DISABLE_VAESF__ +#undef __DISABLE_VAESF__ +#pragma GCC pop_options +#endif /* __DISABLE_VAES__ */ + +#endif /* __VAESINTRIN_H_INCLUDED */ diff --git a/include-gcc/vpclmulqdqintrin.h b/include-gcc/vpclmulqdqintrin.h new file mode 100644 index 0000000..ba93fc4 --- /dev/null +++ b/include-gcc/vpclmulqdqintrin.h @@ -0,0 +1,81 @@ +/* Copyright (C) 2014-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. 
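A short sketch of the VAES forms above: they apply one AES round per 128-bit lane, so two (or, with the AVX-512 forms, four) blocks advance per instruction. Key expansion is out of scope here; -mvaes -mavx and illustrative names are assumed.

#include <immintrin.h>

/* Illustrative: one AES encryption round applied independently to both
   128-bit lanes of a 256-bit state, using the matching lane of the key. */
static __m256i
aes_round_x2 (__m256i state, __m256i round_key)
{
  return _mm256_aesenc_epi128 (state, round_key);
}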
+ + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _IMMINTRIN_H_INCLUDED +#error "Never use directly; include instead." +#endif + +#ifndef _VPCLMULQDQINTRIN_H_INCLUDED +#define _VPCLMULQDQINTRIN_H_INCLUDED + +#if !defined(__VPCLMULQDQ__) || !defined(__AVX512F__) +#pragma GCC push_options +#pragma GCC target("vpclmulqdq,avx512f") +#define __DISABLE_VPCLMULQDQF__ +#endif /* __VPCLMULQDQF__ */ + +#ifdef __OPTIMIZE__ +extern __inline __m512i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm512_clmulepi64_epi128 (__m512i __A, __m512i __B, const int __C) +{ + return (__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)__A, + (__v8di) __B, __C); +} +#else +#define _mm512_clmulepi64_epi128(A, B, C) \ + ((__m512i) __builtin_ia32_vpclmulqdq_v8di ((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_VPCLMULQDQF__ +#undef __DISABLE_VPCLMULQDQF__ +#pragma GCC pop_options +#endif /* __DISABLE_VPCLMULQDQF__ */ + +#if !defined(__VPCLMULQDQ__) || !defined(__AVX__) +#pragma GCC push_options +#pragma GCC target("vpclmulqdq,avx") +#define __DISABLE_VPCLMULQDQ__ +#endif /* __VPCLMULQDQ__ */ + +#ifdef __OPTIMIZE__ +extern __inline __m256i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_clmulepi64_epi128 (__m256i __A, __m256i __B, const int __C) +{ + return (__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)__A, + (__v4di) __B, __C); +} +#else +#define _mm256_clmulepi64_epi128(A, B, C) \ + ((__m256i) __builtin_ia32_vpclmulqdq_v4di ((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(C))) +#endif + +#ifdef __DISABLE_VPCLMULQDQ__ +#undef __DISABLE_VPCLMULQDQ__ +#pragma GCC pop_options +#endif /* __DISABLE_VPCLMULQDQ__ */ + +#endif /* _VPCLMULQDQINTRIN_H_INCLUDED */ diff --git a/include-gcc/waitpkgintrin.h b/include-gcc/waitpkgintrin.h new file mode 100644 index 0000000..9d2f23a --- /dev/null +++ b/include-gcc/waitpkgintrin.h @@ -0,0 +1,63 @@ +/* Copyright (C) 2018-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." 
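For the vectorised carry-less multiply above, the immediate selects which quadword of each 128-bit lane participates, exactly as with the scalar PCLMULQDQ form; the sketch below multiplies the low quadwords of each lane, giving two independent 64x64 to 128-bit products. Assumes -mvpclmulqdq -mavx; typical callers are CRC folding and GHASH kernels.

#include <immintrin.h>

/* Illustrative: selector 0x00 picks the low 64 bits of every lane of both
   operands; 0x11 would pick the high 64 bits of both.                     */
static __m256i
clmul_lo_lanes (__m256i a, __m256i b)
{
  return _mm256_clmulepi64_epi128 (a, b, 0x00);
}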
+#endif + +#ifndef _WAITPKG_H_INCLUDED +#define _WAITPKG_H_INCLUDED + +#ifndef __WAITPKG__ +#pragma GCC push_options +#pragma GCC target("waitpkg") +#define __DISABLE_WAITPKG__ +#endif /* __WAITPKG__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_umonitor (void *__A) +{ + __builtin_ia32_umonitor (__A); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_umwait (unsigned int __A, unsigned long long __B) +{ + return __builtin_ia32_umwait (__A, __B); +} + +extern __inline unsigned char +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_tpause (unsigned int __A, unsigned long long __B) +{ + return __builtin_ia32_tpause (__A, __B); +} + +#ifdef __DISABLE_WAITPKG__ +#undef __DISABLE_WAITPKG__ +#pragma GCC pop_options +#endif /* __DISABLE_WAITPKG__ */ + +#endif /* _WAITPKG_H_INCLUDED. */ diff --git a/include-gcc/wbnoinvdintrin.h b/include-gcc/wbnoinvdintrin.h new file mode 100644 index 0000000..5d1e0ab --- /dev/null +++ b/include-gcc/wbnoinvdintrin.h @@ -0,0 +1,49 @@ +/* Copyright (C) 2018-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _WBNOINVDINTRIN_H_INCLUDED +#define _WBNOINVDINTRIN_H_INCLUDED + +#ifndef __WBNOINVD__ +#pragma GCC push_options +#pragma GCC target("wbnoinvd") +#define __DISABLE_WBNOINVD__ +#endif /* __WBNOINVD__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wbnoinvd (void) +{ + __builtin_ia32_wbnoinvd (); +} + +#ifdef __DISABLE_WBNOINVD__ +#undef __DISABLE_WBNOINVD__ +#pragma GCC pop_options +#endif /* __DISABLE_WBNOINVD__ */ + +#endif /* _WBNOINVDINTRIN_H_INCLUDED */ diff --git a/include-gcc/wmmintrin.h b/include-gcc/wmmintrin.h new file mode 100644 index 0000000..ae15cea --- /dev/null +++ b/include-gcc/wmmintrin.h @@ -0,0 +1,132 @@ +/* Copyright (C) 2008-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
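The WAITPKG trio above is normally used as an arm/check/wait loop; a hedged sketch follows. It assumes -mwaitpkg plus hardware and OS support, the flag and deadline names are illustrative, and control value 0 requests the deeper C0.2 state.

#include <x86gprintrin.h>

/* Illustrative: doze until *flag becomes nonzero or the TSC deadline is
   reached.  _umonitor arms the monitor, the re-check avoids a lost wakeup,
   and _umwait sleeps until a write, the deadline, or an OS-imposed limit.  */
static void
wait_for_flag (volatile unsigned int *flag, unsigned long long tsc_deadline)
{
  while (!*flag)
    {
      _umonitor ((void *) flag);
      if (*flag)
        break;
      _umwait (0, tsc_deadline);
    }
}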
+ + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 10.1. */ + +#ifndef _WMMINTRIN_H_INCLUDED +#define _WMMINTRIN_H_INCLUDED + +/* We need definitions from the SSE2 header file. */ +#include + +/* AES */ + +#if !defined(__AES__) || !defined(__SSE2__) +#pragma GCC push_options +#pragma GCC target("aes,sse2") +#define __DISABLE_AES__ +#endif /* __AES__ */ + +/* Performs 1 round of AES decryption of the first m128i using + the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdec_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesdec128 ((__v2di)__X, (__v2di)__Y); +} + +/* Performs the last round of AES decryption of the first m128i + using the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesdeclast_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesdeclast128 ((__v2di)__X, + (__v2di)__Y); +} + +/* Performs 1 round of AES encryption of the first m128i using + the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenc_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesenc128 ((__v2di)__X, (__v2di)__Y); +} + +/* Performs the last round of AES encryption of the first m128i + using the second m128i as a round key. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesenclast_si128 (__m128i __X, __m128i __Y) +{ + return (__m128i) __builtin_ia32_aesenclast128 ((__v2di)__X, (__v2di)__Y); +} + +/* Performs the InverseMixColumn operation on the source m128i + and stores the result into m128i destination. */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aesimc_si128 (__m128i __X) +{ + return (__m128i) __builtin_ia32_aesimc128 ((__v2di)__X); +} + +/* Generates a m128i round key for the input m128i AES cipher key and + byte round constant. The second parameter must be a compile time + constant. */ +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_aeskeygenassist_si128 (__m128i __X, const int __C) +{ + return (__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)__X, __C); +} +#else +#define _mm_aeskeygenassist_si128(X, C) \ + ((__m128i) __builtin_ia32_aeskeygenassist128 ((__v2di)(__m128i)(X), \ + (int)(C))) +#endif + +#ifdef __DISABLE_AES__ +#undef __DISABLE_AES__ +#pragma GCC pop_options +#endif /* __DISABLE_AES__ */ + +/* PCLMUL */ + +#if !defined(__PCLMUL__) || !defined(__SSE2__) +#pragma GCC push_options +#pragma GCC target("pclmul,sse2") +#define __DISABLE_PCLMUL__ +#endif /* __PCLMUL__ */ + +/* Performs carry-less integer multiplication of 64-bit halves of + 128-bit input operands. The third parameter inducates which 64-bit + haves of the input parameters v1 and v2 should be used. It must be + a compile time constant. 
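Tying the AES-NI primitives above together, a full AES-128 block encryption is an initial whitening XOR, nine _mm_aesenc_si128 rounds and one _mm_aesenclast_si128; the sketch below takes an already expanded key schedule (key expansion via _mm_aeskeygenassist_si128 is omitted). Names are illustrative and -maes is assumed.

#include <wmmintrin.h>

/* Illustrative: encrypt one 16-byte block with AES-128 given round keys
   rk[0..10].  _mm_xor_si128 comes from the SSE2 header included above.  */
static __m128i
aes128_encrypt_block (__m128i block, const __m128i rk[11])
{
  block = _mm_xor_si128 (block, rk[0]);        /* initial AddRoundKey */
  for (int i = 1; i < 10; i++)
    block = _mm_aesenc_si128 (block, rk[i]);   /* rounds 1..9         */
  return _mm_aesenclast_si128 (block, rk[10]); /* final round         */
}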
*/ +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_clmulepi64_si128 (__m128i __X, __m128i __Y, const int __I) +{ + return (__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)__X, + (__v2di)__Y, __I); +} +#else +#define _mm_clmulepi64_si128(X, Y, I) \ + ((__m128i) __builtin_ia32_pclmulqdq128 ((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (int)(I))) +#endif + +#ifdef __DISABLE_PCLMUL__ +#undef __DISABLE_PCLMUL__ +#pragma GCC pop_options +#endif /* __DISABLE_PCLMUL__ */ + +#endif /* _WMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/x86gprintrin.h b/include-gcc/x86gprintrin.h new file mode 100644 index 0000000..f41be3f --- /dev/null +++ b/include-gcc/x86gprintrin.h @@ -0,0 +1,275 @@ +/* Copyright (C) 2020-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +#define _X86GPRINTRIN_H_INCLUDED + +#if !defined _SOFT_FLOAT || defined __MMX__ || defined __SSE__ +#pragma GCC push_options +#pragma GCC target("general-regs-only") +#define __DISABLE_GENERAL_REGS_ONLY__ +#endif + +#include + +#ifndef __iamcu__ + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +#include + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_wbinvd (void) +{ + __builtin_ia32_wbinvd (); +} + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand16_step (unsigned short *__P) +{ + return __builtin_ia32_rdrand16_step (__P); +} + +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand32_step (unsigned int *__P) +{ + return __builtin_ia32_rdrand32_step (__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#ifndef __RDPID__ +#pragma GCC push_options +#pragma GCC target("rdpid") +#define __DISABLE_RDPID__ +#endif /* __RDPID__ */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdpid_u32 (void) +{ + return __builtin_ia32_rdpid (); +} +#ifdef __DISABLE_RDPID__ +#undef __DISABLE_RDPID__ +#pragma GCC pop_options +#endif /* 
__DISABLE_RDPID__ */ + +#ifdef __x86_64__ + +#ifndef __FSGSBASE__ +#pragma GCC push_options +#pragma GCC target("fsgsbase") +#define __DISABLE_FSGSBASE__ +#endif /* __FSGSBASE__ */ +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u32 (void) +{ + return __builtin_ia32_rdfsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readfsbase_u64 (void) +{ + return __builtin_ia32_rdfsbase64 (); +} + +extern __inline unsigned int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u32 (void) +{ + return __builtin_ia32_rdgsbase32 (); +} + +extern __inline unsigned long long +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_readgsbase_u64 (void) +{ + return __builtin_ia32_rdgsbase64 (); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrfsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writefsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrfsbase64 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writegsbase_u32 (unsigned int __B) +{ + __builtin_ia32_wrgsbase32 (__B); +} + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_writegsbase_u64 (unsigned long long __B) +{ + __builtin_ia32_wrgsbase64 (__B); +} +#ifdef __DISABLE_FSGSBASE__ +#undef __DISABLE_FSGSBASE__ +#pragma GCC pop_options +#endif /* __DISABLE_FSGSBASE__ */ + +#ifndef __RDRND__ +#pragma GCC push_options +#pragma GCC target("rdrnd") +#define __DISABLE_RDRND__ +#endif /* __RDRND__ */ +extern __inline int +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_rdrand64_step (unsigned long long *__P) +{ + return __builtin_ia32_rdrand64_step (__P); +} +#ifdef __DISABLE_RDRND__ +#undef __DISABLE_RDRND__ +#pragma GCC pop_options +#endif /* __DISABLE_RDRND__ */ + +#endif /* __x86_64__ */ + +#ifndef __PTWRITE__ +#pragma GCC push_options +#pragma GCC target("ptwrite") +#define __DISABLE_PTWRITE__ +#endif + +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_ptwrite64 (unsigned long long __B) +{ + __builtin_ia32_ptwrite64 (__B); +} +#endif /* __x86_64__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_ptwrite32 (unsigned __B) +{ + __builtin_ia32_ptwrite32 (__B); +} +#ifdef __DISABLE_PTWRITE__ +#undef __DISABLE_PTWRITE__ +#pragma GCC pop_options +#endif /* __DISABLE_PTWRITE__ */ + +#endif /* __iamcu__ */ + +#ifdef __DISABLE_GENERAL_REGS_ONLY__ +#undef __DISABLE_GENERAL_REGS_ONLY__ +#pragma GCC pop_options +#endif /* __DISABLE_GENERAL_REGS_ONLY__ */ + +#endif /* _X86GPRINTRIN_H_INCLUDED. */ diff --git a/include-gcc/x86intrin.h b/include-gcc/x86intrin.h new file mode 100644 index 0000000..ac612ce --- /dev/null +++ b/include-gcc/x86intrin.h @@ -0,0 +1,42 @@ +/* Copyright (C) 2008-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
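The _rdrand*_step intrinsics above return 0 when the hardware momentarily has no random data available, so callers conventionally retry a bounded number of times; a sketch (illustrative names, -mrdrnd assumed) follows.

#include <x86gprintrin.h>

/* Illustrative: bounded-retry wrapper around _rdrand32_step.  Returns 1 and
   stores a value on success, 0 if every attempt reported failure.          */
static int
rdrand32_retry (unsigned int *out, int tries)
{
  while (tries-- > 0)
    if (_rdrand32_step (out))
      return 1;
  return 0;
}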
+ + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86INTRIN_H_INCLUDED +#define _X86INTRIN_H_INCLUDED + +#include + +#ifndef __iamcu__ + +/* For including AVX instructions */ +#include + +#include + +#include + +#include + +#endif /* __iamcu__ */ + +#endif /* _X86INTRIN_H_INCLUDED */ diff --git a/include-gcc/xmmintrin.h b/include-gcc/xmmintrin.h new file mode 100644 index 0000000..cb518fc --- /dev/null +++ b/include-gcc/xmmintrin.h @@ -0,0 +1,1340 @@ +/* Copyright (C) 2002-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef _XMMINTRIN_H_INCLUDED +#define _XMMINTRIN_H_INCLUDED + +/* We need type definitions from the MMX header file. */ +#include + +/* Get _mm_malloc () and _mm_free (). */ +#include + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint +{ + _MM_HINT_IT0 = 19, + _MM_HINT_IT1 = 18, + /* _MM_HINT_ET is _MM_HINT_T with set 3rd bit. */ + _MM_HINT_ET0 = 7, + _MM_HINT_ET1 = 6, + _MM_HINT_T0 = 3, + _MM_HINT_T1 = 2, + _MM_HINT_T2 = 1, + _MM_HINT_NTA = 0 +}; + +/* Loads one cache line from address P to a location "closer" to the + processor. The selector I specifies the type of prefetch operation. */ +#ifdef __OPTIMIZE__ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_prefetch (const void *__P, enum _mm_hint __I) +{ + __builtin_ia32_prefetch (__P, (__I & 0x4) >> 2, + __I & 0x3, (__I & 0x10) >> 4); +} +#else +#define _mm_prefetch(P, I) \ + __builtin_ia32_prefetch ((P), ((I) & 0x4) >> 2, ((I) & 0x3), ((I) & 0x10) >> 4) +#endif + +#ifndef __SSE__ +#pragma GCC push_options +#pragma GCC target("sse") +#define __DISABLE_SSE__ +#endif /* __SSE__ */ + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); + +/* Unaligned version of the same type. 
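A small sketch of _mm_prefetch above: the hint must be a compile-time constant from enum _mm_hint, and the distances below are illustrative tuning values rather than recommendations.

#include <xmmintrin.h>

/* Illustrative: request upcoming array elements ahead of use.  _MM_HINT_T0
   pulls into all cache levels; _MM_HINT_NTA minimises cache pollution.     */
static void
touch_ahead (const float *p)
{
  _mm_prefetch ((const void *) (p + 64), _MM_HINT_T0);
  _mm_prefetch ((const void *) (p + 128), _MM_HINT_NTA);
}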
*/ +typedef float __m128_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1))); + +/* Internal data types for implementing the intrinsics. */ +typedef float __v4sf __attribute__ ((__vector_size__ (16))); + +/* Create a selector for use with the SHUFPS instruction. */ +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) + +/* Bits in the MXCSR. */ +#define _MM_EXCEPT_MASK 0x003f +#define _MM_EXCEPT_INVALID 0x0001 +#define _MM_EXCEPT_DENORM 0x0002 +#define _MM_EXCEPT_DIV_ZERO 0x0004 +#define _MM_EXCEPT_OVERFLOW 0x0008 +#define _MM_EXCEPT_UNDERFLOW 0x0010 +#define _MM_EXCEPT_INEXACT 0x0020 + +#define _MM_MASK_MASK 0x1f80 +#define _MM_MASK_INVALID 0x0080 +#define _MM_MASK_DENORM 0x0100 +#define _MM_MASK_DIV_ZERO 0x0200 +#define _MM_MASK_OVERFLOW 0x0400 +#define _MM_MASK_UNDERFLOW 0x0800 +#define _MM_MASK_INEXACT 0x1000 + +#define _MM_ROUND_MASK 0x6000 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +#define _MM_FLUSH_ZERO_MASK 0x8000 +#define _MM_FLUSH_ZERO_ON 0x8000 +#define _MM_FLUSH_ZERO_OFF 0x0000 + +/* Create an undefined vector. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_undefined_ps (void) +{ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Winit-self" + __m128 __Y = __Y; +#pragma GCC diagnostic pop + return __Y; +} + +/* Create a vector of zeros. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setzero_ps (void) +{ + return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f }; +} + +/* Perform the respective operation on the lower SPFP (single-precision + floating-point) values of A and B; the upper three SPFP values are + passed through from A. 
*/ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_addss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_subss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_mulss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_divss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ss (__m128 __A) +{ + return (__m128) __builtin_ia32_sqrtss ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ss (__m128 __A) +{ + return (__m128) __builtin_ia32_rcpss ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ss (__m128 __A) +{ + return (__m128) __builtin_ia32_rsqrtss ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxss ((__v4sf)__A, (__v4sf)__B); +} + +/* Perform the respective operation on the four SPFP values in A and B. 
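Of the scalar operations above, _mm_rcp_ss and _mm_rsqrt_ss return roughly 12-bit estimates, and callers needing more precision conventionally add one Newton-Raphson step, x' = x * (1.5 - 0.5 * a * x * x); a sketch for the reciprocal square root follows. _mm_set_ss is assumed from later in this header, and only the low element of the result is meaningful.

#include <xmmintrin.h>

/* Illustrative: refine the _mm_rsqrt_ss estimate with one NR iteration.  */
static __m128
rsqrt_nr_ss (__m128 a)
{
  __m128 x = _mm_rsqrt_ss (a);
  __m128 t = _mm_mul_ss (_mm_mul_ss (_mm_set_ss (0.5f), a),
                         _mm_mul_ss (x, x));
  return _mm_mul_ss (x, _mm_sub_ss (_mm_set_ss (1.5f), t));
}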
*/ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_add_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A + (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sub_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A - (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mul_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A * (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_div_ps (__m128 __A, __m128 __B) +{ + return (__m128) ((__v4sf)__A / (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sqrt_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_sqrtps ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rcp_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rcpps ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rsqrt_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rsqrtps ((__v4sf)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_minps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_maxps ((__v4sf)__A, (__v4sf)__B); +} + +/* Perform logical bit-wise operations on 128-bit values. */ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_and_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_andps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_andnot_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_andnps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_or_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_orps (__A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_xor_ps (__m128 __A, __m128 __B) +{ + return __builtin_ia32_xorps (__A, __B); +} + +/* Perform a comparison on the lower SPFP values of A and B. If the + comparison is true, place a mask of all ones in the result, otherwise a + mask of zeros. The upper three SPFP values are passed through from A. 
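The packed forms above combine in the obvious way; as a minimal sketch, a 4-wide a * x + y step (plain SSE, no FMA assumed, names illustrative) looks like this:

#include <xmmintrin.h>

/* Illustrative: elementwise a * x + y over four floats at a time.  */
static __m128
axpy4 (__m128 a, __m128 x, __m128 y)
{
  return _mm_add_ps (_mm_mul_ps (a, x), y);
}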
*/ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpeqss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpltss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpless ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpltss ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpless ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpneqss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnltss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnless ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpnltss ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf) __A, + (__v4sf) + __builtin_ia32_cmpnless ((__v4sf) __B, + (__v4sf) + __A)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpordss ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpunordss ((__v4sf)__A, (__v4sf)__B); +} + +/* Perform a comparison on the four SPFP values of A and B. For each + element, if the comparison is true, place a mask of all ones in the + result, otherwise a mask of zeros. 
*/ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpeq_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpeqps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmplt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpltps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmple_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpleps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpgt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpgtps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpge_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpgeps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpneq_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpneqps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnlt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnltps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnle_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpnleps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpngt_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpngtps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpnge_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpngeps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpord_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpordps ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmpunord_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_cmpunordps ((__v4sf)__A, (__v4sf)__B); +} + +/* Compare the lower SPFP values of A and B and return 1 if true + and 0 if false. 
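Because the packed comparisons above produce all-ones/all-zeros masks, they combine with the logical operations earlier in this header into branchless selects; a sketch of a hand-rolled per-element minimum (equivalent in effect to _mm_min_ps for ordinary, non-NaN values) follows.

#include <xmmintrin.h>

/* Illustrative: result[i] = (a[i] < b[i]) ? a[i] : b[i].  */
static __m128
select_smaller (__m128 a, __m128 b)
{
  __m128 mask = _mm_cmplt_ps (a, b);          /* all ones where a < b   */
  return _mm_or_ps (_mm_and_ps (mask, a),     /* take a where mask set  */
                    _mm_andnot_ps (mask, b)); /* take b elsewhere       */
}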
*/ + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comieq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comieq ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comilt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comilt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comile_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comile ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comigt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comigt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comige_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comige ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comineq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_comineq ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomieq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomieq ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomilt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomilt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomile_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomile ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomigt_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomigt ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomige_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomige ((__v4sf)__A, (__v4sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_ucomineq_ss (__m128 __A, __m128 __B) +{ + return __builtin_ia32_ucomineq ((__v4sf)__A, (__v4sf)__B); +} + +/* Convert the lower SPFP value to a 32-bit integer according to the current + rounding mode. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_si32 (__m128 __A) +{ + return __builtin_ia32_cvtss2si ((__v4sf) __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_ss2si (__m128 __A) +{ + return _mm_cvtss_si32 (__A); +} + +#ifdef __x86_64__ +/* Convert the lower SPFP value to a 32-bit integer according to the + current rounding mode. */ + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_si64 (__m128 __A) +{ + return __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_si64x (__m128 __A) +{ + return __builtin_ia32_cvtss2si64 ((__v4sf) __A); +} +#endif + +/* Convert the two lower SPFP values to 32-bit integers according to the + current rounding mode. Return the integers in packed form. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi32 (__m128 __A) +{ + return (__m64) __builtin_ia32_cvtps2pi ((__v4sf) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_ps2pi (__m128 __A) +{ + return _mm_cvtps_pi32 (__A); +} + +/* Truncate the lower SPFP value to a 32-bit integer. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_si32 (__m128 __A) +{ + return __builtin_ia32_cvttss2si ((__v4sf) __A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_ss2si (__m128 __A) +{ + return _mm_cvttss_si32 (__A); +} + +#ifdef __x86_64__ +/* Truncate the lower SPFP value to a 32-bit integer. */ + +/* Intel intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_si64 (__m128 __A) +{ + return __builtin_ia32_cvttss2si64 ((__v4sf) __A); +} + +/* Microsoft intrinsic. */ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttss_si64x (__m128 __A) +{ + return __builtin_ia32_cvttss2si64 ((__v4sf) __A); +} +#endif + +/* Truncate the two lower SPFP values to 32-bit integers. Return the + integers in packed form. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvttps_pi32 (__m128 __A) +{ + return (__m64) __builtin_ia32_cvttps2pi ((__v4sf) __A); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtt_ps2pi (__m128 __A) +{ + return _mm_cvttps_pi32 (__A); +} + +/* Convert B to a SPFP value and insert it as element zero in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi32_ss (__m128 __A, int __B) +{ + return (__m128) __builtin_ia32_cvtsi2ss ((__v4sf) __A, __B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_si2ss (__m128 __A, int __B) +{ + return _mm_cvtsi32_ss (__A, __B); +} + +#ifdef __x86_64__ +/* Convert B to a SPFP value and insert it as element zero in A. */ + +/* Intel intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} + +/* Microsoft intrinsic. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtsi64x_ss (__m128 __A, long long __B) +{ + return (__m128) __builtin_ia32_cvtsi642ss ((__v4sf) __A, __B); +} +#endif + +/* Convert the two 32-bit values in B to SPFP form and insert them + as the two lower elements in A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32_ps (__m128 __A, __m64 __B) +{ + return (__m128) __builtin_ia32_cvtpi2ps ((__v4sf) __A, (__v2si)__B); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvt_pi2ps (__m128 __A, __m64 __B) +{ + return _mm_cvtpi32_ps (__A, __B); +} + +/* Convert the four signed 16-bit values in A to SPFP form. 
*/ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi16_ps (__m64 __A) +{ + __v4hi __sign; + __v2si __hisi, __losi; + __v4sf __zero, __ra, __rb; + + /* This comparison against zero gives us a mask that can be used to + fill in the missing sign bits in the unpack operations below, so + that we get signed values after unpacking. */ + __sign = __builtin_ia32_pcmpgtw ((__v4hi)0LL, (__v4hi)__A); + + /* Convert the four words to doublewords. */ + __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, __sign); + __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, __sign); + + /* Convert the doublewords to floating point two at a time. */ + __zero = (__v4sf) _mm_setzero_ps (); + __ra = __builtin_ia32_cvtpi2ps (__zero, __losi); + __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi); + + return (__m128) __builtin_ia32_movlhps (__ra, __rb); +} + +/* Convert the four unsigned 16-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpu16_ps (__m64 __A) +{ + __v2si __hisi, __losi; + __v4sf __zero, __ra, __rb; + + /* Convert the four words to doublewords. */ + __losi = (__v2si) __builtin_ia32_punpcklwd ((__v4hi)__A, (__v4hi)0LL); + __hisi = (__v2si) __builtin_ia32_punpckhwd ((__v4hi)__A, (__v4hi)0LL); + + /* Convert the doublewords to floating point two at a time. */ + __zero = (__v4sf) _mm_setzero_ps (); + __ra = __builtin_ia32_cvtpi2ps (__zero, __losi); + __rb = __builtin_ia32_cvtpi2ps (__ra, __hisi); + + return (__m128) __builtin_ia32_movlhps (__ra, __rb); +} + +/* Convert the low four signed 8-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi8_ps (__m64 __A) +{ + __v8qi __sign; + + /* This comparison against zero gives us a mask that can be used to + fill in the missing sign bits in the unpack operations below, so + that we get signed values after unpacking. */ + __sign = __builtin_ia32_pcmpgtb ((__v8qi)0LL, (__v8qi)__A); + + /* Convert the four low bytes to words. */ + __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, __sign); + + return _mm_cvtpi16_ps(__A); +} + +/* Convert the low four unsigned 8-bit values in A to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpu8_ps(__m64 __A) +{ + __A = (__m64) __builtin_ia32_punpcklbw ((__v8qi)__A, (__v8qi)0LL); + return _mm_cvtpu16_ps(__A); +} + +/* Convert the four signed 32-bit values in A and B to SPFP form. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtpi32x2_ps(__m64 __A, __m64 __B) +{ + __v4sf __zero = (__v4sf) _mm_setzero_ps (); + __v4sf __sfa = __builtin_ia32_cvtpi2ps (__zero, (__v2si)__A); + __v4sf __sfb = __builtin_ia32_cvtpi2ps (__sfa, (__v2si)__B); + return (__m128) __builtin_ia32_movlhps (__sfa, __sfb); +} + +/* Convert the four SPFP values in A to four signed 16-bit integers. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi16(__m128 __A) +{ + __v4sf __hisf = (__v4sf)__A; + __v4sf __losf = __builtin_ia32_movhlps (__hisf, __hisf); + __v2si __hisi = __builtin_ia32_cvtps2pi (__hisf); + __v2si __losi = __builtin_ia32_cvtps2pi (__losf); + return (__m64) __builtin_ia32_packssdw (__hisi, __losi); +} + +/* Convert the four SPFP values in A to four signed 8-bit integers. 
*/ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtps_pi8(__m128 __A) +{ + __v4hi __tmp = (__v4hi) _mm_cvtps_pi16 (__A); + return (__m64) __builtin_ia32_packsswb (__tmp, (__v4hi)0LL); +} + +/* Selects four specific SPFP values from A and B based on MASK. */ +#ifdef __OPTIMIZE__ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask) +{ + return (__m128) __builtin_ia32_shufps ((__v4sf)__A, (__v4sf)__B, __mask); +} +#else +#define _mm_shuffle_ps(A, B, MASK) \ + ((__m128) __builtin_ia32_shufps ((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(MASK))) +#endif + +/* Selects and interleaves the upper two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpackhi_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpckhps ((__v4sf)__A, (__v4sf)__B); +} + +/* Selects and interleaves the lower two SPFP values from A and B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_unpacklo_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_unpcklps ((__v4sf)__A, (__v4sf)__B); +} + +/* Sets the upper two SPFP values with 64-bits of data loaded from P; + the lower two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadh_pi (__m128 __A, __m64 const *__P) +{ + return (__m128) __builtin_ia32_loadhps ((__v4sf)__A, (const __v2sf *)__P); +} + +/* Stores the upper two SPFP values of A into P. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeh_pi (__m64 *__P, __m128 __A) +{ + __builtin_ia32_storehps ((__v2sf *)__P, (__v4sf)__A); +} + +/* Moves the upper two values of B into the lower two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehl_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movhlps ((__v4sf)__A, (__v4sf)__B); +} + +/* Moves the lower two values of B into the upper two values of A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movelh_ps (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movlhps ((__v4sf)__A, (__v4sf)__B); +} + +/* Sets the lower two SPFP values with 64-bits of data loaded from P; + the upper two values are passed through from A. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadl_pi (__m128 __A, __m64 const *__P) +{ + return (__m128) __builtin_ia32_loadlps ((__v4sf)__A, (const __v2sf *)__P); +} + +/* Stores the lower two SPFP values of A into P. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storel_pi (__m64 *__P, __m128 __A) +{ + __builtin_ia32_storelps ((__v2sf *)__P, (__v4sf)__A); +} + +/* Creates a 4-bit mask from the most significant bits of the SPFP values. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_ps (__m128 __A) +{ + return __builtin_ia32_movmskps ((__v4sf)__A); +} + +/* Return the contents of the control register. */ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_getcsr (void) +{ + return __builtin_ia32_stmxcsr (); +} + +/* Read exception bits from the control register. 
*/ +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_EXCEPTION_STATE (void) +{ + return _mm_getcsr() & _MM_EXCEPT_MASK; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_EXCEPTION_MASK (void) +{ + return _mm_getcsr() & _MM_MASK_MASK; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_ROUNDING_MODE (void) +{ + return _mm_getcsr() & _MM_ROUND_MASK; +} + +extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_GET_FLUSH_ZERO_MODE (void) +{ + return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; +} + +/* Set the control register to I. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setcsr (unsigned int __I) +{ + __builtin_ia32_ldmxcsr (__I); +} + +/* Set exception bits in the control register. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_EXCEPTION_STATE(unsigned int __mask) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_EXCEPTION_MASK (unsigned int __mask) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_ROUNDING_MODE (unsigned int __mode) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | __mode); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode) +{ + _mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | __mode); +} + +/* Create a vector with element 0 as F and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ss (float __F) +{ + return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f }; +} + +/* Create a vector with all four elements equal to F. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set1_ps (float __F) +{ + return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F }; +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ps1 (float __F) +{ + return _mm_set1_ps (__F); +} + +/* Create a vector with element 0 as *P and the rest zero. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ss (float const *__P) +{ + return _mm_set_ss (*__P); +} + +/* Create a vector with all four elements equal to *P. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load1_ps (float const *__P) +{ + return _mm_set1_ps (*__P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ps1 (float const *__P) +{ + return _mm_load1_ps (__P); +} + +/* Load four SPFP values from P. The address must be 16-byte aligned. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_load_ps (float const *__P) +{ + return *(__m128 *)__P; +} + +/* Load four SPFP values from P. The address need not be 16-byte aligned. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadu_ps (float const *__P) +{ + return *(__m128_u *)__P; +} + +/* Load four SPFP values in reverse order. 
The address must be aligned. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loadr_ps (float const *__P) +{ + __v4sf __tmp = *(__v4sf *)__P; + return (__m128) __builtin_ia32_shufps (__tmp, __tmp, _MM_SHUFFLE (0,1,2,3)); +} + +/* Create the vector [Z Y X W]. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W) +{ + return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z }; +} + +/* Create the vector [W X Y Z]. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_setr_ps (float __Z, float __Y, float __X, float __W) +{ + return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W }; +} + +/* Stores the lower SPFP value. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ss (float *__P, __m128 __A) +{ + *__P = ((__v4sf)__A)[0]; +} + +extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cvtss_f32 (__m128 __A) +{ + return ((__v4sf)__A)[0]; +} + +/* Store four SPFP values. The address must be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ps (float *__P, __m128 __A) +{ + *(__m128 *)__P = __A; +} + +/* Store four SPFP values. The address need not be 16-byte aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storeu_ps (float *__P, __m128 __A) +{ + *(__m128_u *)__P = __A; +} + +/* Store the lower SPFP value across four words. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store1_ps (float *__P, __m128 __A) +{ + __v4sf __va = (__v4sf)__A; + __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,0,0,0)); + _mm_storeu_ps (__P, __tmp); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_store_ps1 (float *__P, __m128 __A) +{ + _mm_store1_ps (__P, __A); +} + +/* Store four SPFP values in reverse order. The address must be aligned. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_storer_ps (float *__P, __m128 __A) +{ + __v4sf __va = (__v4sf)__A; + __v4sf __tmp = __builtin_ia32_shufps (__va, __va, _MM_SHUFFLE (0,1,2,3)); + _mm_store_ps (__P, __tmp); +} + +/* Sets the low SPFP value of A from the low value of B. */ +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_move_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_shuffle ((__v4sf)__A, (__v4sf)__B, + __extension__ + (__attribute__((__vector_size__ (16))) int) + {4,1,2,3}); +} + +/* Extracts one of the four words of A. The selector N must be immediate. */ +#ifdef __OPTIMIZE__ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_extract_pi16 (__m64 const __A, int const __N) +{ + return (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)__A, __N); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pextrw (__m64 const __A, int const __N) +{ + return _mm_extract_pi16 (__A, __N); +} +#else +#define _mm_extract_pi16(A, N) \ + ((int) (unsigned short) __builtin_ia32_vec_ext_v4hi ((__v4hi)(__m64)(A), (int)(N))) + +#define _m_pextrw(A, N) _mm_extract_pi16(A, N) +#endif + +/* Inserts word D into one of four words of A. 
The selector N must be + immediate. */ +#ifdef __OPTIMIZE__ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_insert_pi16 (__m64 const __A, int const __D, int const __N) +{ + return (__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)__A, __D, __N); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pinsrw (__m64 const __A, int const __D, int const __N) +{ + return _mm_insert_pi16 (__A, __D, __N); +} +#else +#define _mm_insert_pi16(A, D, N) \ + ((__m64) __builtin_ia32_vec_set_v4hi ((__v4hi)(__m64)(A), \ + (int)(D), (int)(N))) + +#define _m_pinsrw(A, D, N) _mm_insert_pi16(A, D, N) +#endif + +/* Compute the element-wise maximum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pi16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pmaxsw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxsw (__m64 __A, __m64 __B) +{ + return _mm_max_pi16 (__A, __B); +} + +/* Compute the element-wise maximum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_max_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pmaxub ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmaxub (__m64 __A, __m64 __B) +{ + return _mm_max_pu8 (__A, __B); +} + +/* Compute the element-wise minimum of signed 16-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pi16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pminsw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminsw (__m64 __A, __m64 __B) +{ + return _mm_min_pi16 (__A, __B); +} + +/* Compute the element-wise minimum of unsigned 8-bit values. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_min_pu8 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pminub ((__v8qi)__A, (__v8qi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pminub (__m64 __A, __m64 __B) +{ + return _mm_min_pu8 (__A, __B); +} + +/* Create an 8-bit mask of the signs of 8-bit values. */ +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movemask_pi8 (__m64 __A) +{ + return __builtin_ia32_pmovmskb ((__v8qi)__A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmovmskb (__m64 __A) +{ + return _mm_movemask_pi8 (__A); +} + +/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values + in B and produce the high 16 bits of the 32-bit results. */ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhi_pu16 (__m64 __A, __m64 __B) +{ + return (__m64) __builtin_ia32_pmulhuw ((__v4hi)__A, (__v4hi)__B); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pmulhuw (__m64 __A, __m64 __B) +{ + return _mm_mulhi_pu16 (__A, __B); +} + +/* Return a combination of the four 16-bit values in A. The selector + must be an immediate. 
*/ +#ifdef __OPTIMIZE__ +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi16 (__m64 __A, int const __N) +{ + return (__m64) __builtin_ia32_pshufw ((__v4hi)__A, __N); +} + +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_pshufw (__m64 __A, int const __N) +{ + return _mm_shuffle_pi16 (__A, __N); +} +#else +#define _mm_shuffle_pi16(A, N) \ + ((__m64) __builtin_ia32_pshufw ((__v4hi)(__m64)(A), (int)(N))) + +#define _m_pshufw(A, N) _mm_shuffle_pi16 (A, N) +#endif + +/* Conditionally store byte elements of A into P. The high bit of each + byte in the selector N determines whether the corresponding byte from + A is stored. */ +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P) +{ +#ifdef __MMX_WITH_SSE__ + /* Emulate MMX maskmovq with SSE2 maskmovdqu and handle unmapped bits + 64:127 at address __P. */ + typedef long long __v2di __attribute__ ((__vector_size__ (16))); + typedef char __v16qi __attribute__ ((__vector_size__ (16))); + /* Zero-extend __A and __N to 128 bits. */ + __v2di __A128 = __extension__ (__v2di) { ((__v1di) __A)[0], 0 }; + __v2di __N128 = __extension__ (__v2di) { ((__v1di) __N)[0], 0 }; + + /* Check the alignment of __P. */ + __SIZE_TYPE__ offset = ((__SIZE_TYPE__) __P) & 0xf; + if (offset) + { + /* If the misalignment of __P > 8, subtract __P by 8 bytes. + Otherwise, subtract __P by the misalignment. */ + if (offset > 8) + offset = 8; + __P = (char *) (((__SIZE_TYPE__) __P) - offset); + + /* Shift __A128 and __N128 to the left by the adjustment. */ + switch (offset) + { + case 1: + __A128 = __builtin_ia32_pslldqi128 (__A128, 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 8); + break; + case 2: + __A128 = __builtin_ia32_pslldqi128 (__A128, 2 * 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 2 * 8); + break; + case 3: + __A128 = __builtin_ia32_pslldqi128 (__A128, 3 * 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 3 * 8); + break; + case 4: + __A128 = __builtin_ia32_pslldqi128 (__A128, 4 * 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 4 * 8); + break; + case 5: + __A128 = __builtin_ia32_pslldqi128 (__A128, 5 * 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 5 * 8); + break; + case 6: + __A128 = __builtin_ia32_pslldqi128 (__A128, 6 * 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 6 * 8); + break; + case 7: + __A128 = __builtin_ia32_pslldqi128 (__A128, 7 * 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 7 * 8); + break; + case 8: + __A128 = __builtin_ia32_pslldqi128 (__A128, 8 * 8); + __N128 = __builtin_ia32_pslldqi128 (__N128, 8 * 8); + break; + default: + break; + } + } + __builtin_ia32_maskmovdqu ((__v16qi)__A128, (__v16qi)__N128, __P); +#else + __builtin_ia32_maskmovq ((__v8qi)__A, (__v8qi)__N, __P); +#endif +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_m_maskmovq (__m64 __A, __m64 __N, char *__P) +{ + _mm_maskmove_si64 (__A, __N, __P); +} + +/* Compute the rounded averages of the unsigned 8-bit values in A and B. 
*/
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pavgb ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgb (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu8 (__A, __B);
+}
+
+/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_avg_pu16 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_pavgw ((__v4hi)__A, (__v4hi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_pavgw (__m64 __A, __m64 __B)
+{
+  return _mm_avg_pu16 (__A, __B);
+}
+
+/* Compute the sum of the absolute differences of the unsigned 8-bit
+   values in A and B. Return the value in the lower 16-bit word; the
+   upper words are cleared. */
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sad_pu8 (__m64 __A, __m64 __B)
+{
+  return (__m64) __builtin_ia32_psadbw ((__v8qi)__A, (__v8qi)__B);
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_m_psadbw (__m64 __A, __m64 __B)
+{
+  return _mm_sad_pu8 (__A, __B);
+}
+
+/* Stores the data in A to the address P without polluting the caches. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_pi (__m64 *__P, __m64 __A)
+{
+  __builtin_ia32_movntq ((unsigned long long *)__P, (unsigned long long)__A);
+}
+
+/* Likewise. The address must be 16-byte aligned. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_stream_ps (float *__P, __m128 __A)
+{
+  __builtin_ia32_movntps (__P, (__v4sf)__A);
+}
+
+/* Guarantees that every preceding store is globally visible before
+   any subsequent store. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_sfence (void)
+{
+  __builtin_ia32_sfence ();
+}
+
+/* Transpose the 4x4 matrix composed of row[0-3]. */
+#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
+do { \
+  __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
+  __v4sf __t0 = __builtin_ia32_unpcklps (__r0, __r1); \
+  __v4sf __t1 = __builtin_ia32_unpcklps (__r2, __r3); \
+  __v4sf __t2 = __builtin_ia32_unpckhps (__r0, __r1); \
+  __v4sf __t3 = __builtin_ia32_unpckhps (__r2, __r3); \
+  (row0) = __builtin_ia32_movlhps (__t0, __t1); \
+  (row1) = __builtin_ia32_movhlps (__t1, __t0); \
+  (row2) = __builtin_ia32_movlhps (__t2, __t3); \
+  (row3) = __builtin_ia32_movhlps (__t3, __t2); \
+} while (0)
+
+/* For backward source compatibility. */
+# include <emmintrin.h>
+
+#ifdef __DISABLE_SSE__
+#undef __DISABLE_SSE__
+#pragma GCC pop_options
+#endif /* __DISABLE_SSE__ */
+
+/* The execution of the next instruction is delayed by an implementation
+   specific amount of time. The instruction does not modify the
+   architectural state. This is after the pop_options pragma because
+   it does not require SSE support in the processor--the encoding is a
+   nop on processors that do not support it. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_pause (void)
+{
+  __builtin_ia32_pause ();
+}
+
+#endif /* _XMMINTRIN_H_INCLUDED */
diff --git a/include-gcc/xopintrin.h b/include-gcc/xopintrin.h
new file mode 100644
index 0000000..39a03bf
--- /dev/null
+++ b/include-gcc/xopintrin.h
@@ -0,0 +1,850 @@
+/* Copyright (C) 2007-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+   <http://www.gnu.org/licenses/>. */
+
+#ifndef _X86INTRIN_H_INCLUDED
+# error "Never use <xopintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _XOPMMINTRIN_H_INCLUDED
+#define _XOPMMINTRIN_H_INCLUDED
+
+#include <fma4intrin.h>
+
+#ifndef __XOP__
+#pragma GCC push_options
+#pragma GCC target("xop")
+#define __DISABLE_XOP__
+#endif /* __XOP__ */
+
+/* Integer multiply/add instructions. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssww ((__v8hi)__A,(__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsww ((__v8hi)__A, (__v8hi)__B, (__v8hi)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacswd ((__v8hi)__A, (__v8hi)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsdd ((__v4si)__A, (__v4si)__B, (__v4si)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacssdql ((__v4si)__A, (__v4si)__B, (__v2di)__C);
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C)
+{
+  return (__m128i) __builtin_ia32_vpmacsdql ((__v4si)__A, (__v4si)__B,
(__v2di)__C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacssdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmacsdqh ((__v4si)__A, (__v4si)__B, (__v2di)__C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmadcsswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpmadcswd ((__v8hi)__A,(__v8hi)__B,(__v4si)__C); +} + +/* Packed Integer Horizontal Add and Subtract */ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddw_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbw ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbd ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddbq ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddwd ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddwq ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epi32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadddq ((__v4si)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddw_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubw ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubd ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddubq ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddd_epu16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadduwd ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphadduwq ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_haddq_epu32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphaddudq ((__v4si)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubw_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubbw ((__v16qi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, 
__always_inline__, __artificial__)) +_mm_hsubd_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubwd ((__v8hi)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubq_epi32(__m128i __A) +{ + return (__m128i) __builtin_ia32_vphsubdq ((__v4si)__A); +} + +/* Vector conditional move and permute */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpcmov (__A, __B, __C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i) __builtin_ia32_vpcmov256 (__A, __B, __C); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i) __builtin_ia32_vpperm ((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); +} + +/* Packed Integer Rotates and Shifts + Rotates - Non-Immediate form */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_rot_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vprotq ((__v2di)__A, (__v2di)__B); +} + +/* Rotates - Immediate form */ + +#ifdef __OPTIMIZE__ +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi8(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotbi ((__v16qi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi16(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotwi ((__v8hi)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi32(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotdi ((__v4si)__A, __B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_roti_epi64(__m128i __A, const int __B) +{ + return (__m128i) __builtin_ia32_vprotqi ((__v2di)__A, __B); +} +#else +#define _mm_roti_epi8(A, N) \ + ((__m128i) __builtin_ia32_vprotbi ((__v16qi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi16(A, N) \ + ((__m128i) __builtin_ia32_vprotwi ((__v8hi)(__m128i)(A), (int)(N))) +#define _mm_roti_epi32(A, N) \ + ((__m128i) __builtin_ia32_vprotdi ((__v4si)(__m128i)(A), (int)(N))) +#define _mm_roti_epi64(A, N) \ + ((__m128i) __builtin_ia32_vprotqi ((__v2di)(__m128i)(A), (int)(N))) +#endif + +/* Shifts */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_shl_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshld ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shl_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshlq ((__v2di)__A, (__v2di)__B); +} + + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshab ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshaw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshad ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sha_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpshaq ((__v2di)__A, (__v2di)__B); +} + +/* Compare and Predicate Generation + pcom (integer, unsigned bytes) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseub ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueub ((__v16qi)__A, (__v16qi)__B); +} + +/*pcom (integer, unsigned words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_comle_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseuw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueuw ((__v8hi)__A, (__v8hi)__B); +} + +/*pcom (integer, unsigned double words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseud ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueud ((__v4si)__A, (__v4si)__B); +} + +/*pcom (integer, unsigned quad words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, 
__artificial__)) +_mm_comle_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomequq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomnequq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseuq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epu64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueuq ((__v2di)__A, (__v2di)__B); +} + +/*pcom (integer, signed bytes) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseb ((__v16qi)__A, (__v16qi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueb ((__v16qi)__A, (__v16qi)__B); +} + +/*pcom (integer, signed words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_comle_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomlew ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgew ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqw ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalsew ((__v8hi)__A, (__v8hi)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi16(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtruew ((__v8hi)__A, (__v8hi)__B); +} + +/*pcom (integer, signed double words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comle_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomled ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomged ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqd ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalsed ((__v4si)__A, (__v4si)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi32(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrued ((__v4si)__A, (__v4si)__B); +} + +/*pcom (integer, signed quad words) */ + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comlt_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomltq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm_comle_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomleq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comgt_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgtq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comge_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomgeq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comeq_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomeqq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comneq_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomneqq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comfalse_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomfalseq ((__v2di)__A, (__v2di)__B); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_comtrue_epi64(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vpcomtrueq ((__v2di)__A, (__v2di)__B); +} + +/* FRCZ */ + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_vfrczps ((__v4sf)__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_vfrczpd ((__v2df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_movss ((__v4sf)__A, + (__v4sf) + __builtin_ia32_vfrczss ((__v4sf)__B)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_frcz_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_movsd ((__v2df)__A, + (__v2df) + __builtin_ia32_vfrczsd ((__v2df)__B)); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_frcz_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_vfrczps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_frcz_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_vfrczpd256 ((__v4df)__A); +} + +/* PERMIL2 */ + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I) +{ + return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X, + (__v2df)__Y, + (__v2di)__C, + __I); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I) +{ + return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X, + (__v4df)__Y, + (__v4di)__C, + __I); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I) +{ + return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X, + (__v4sf)__Y, + (__v4si)__C, + __I); +} + +extern __inline __m256 
__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I) +{ + return (__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8si)__C, + __I); +} +#else +#define _mm_permute2_pd(X, Y, C, I) \ + ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128i)(C), \ + (int)(I))) + +#define _mm256_permute2_pd(X, Y, C, I) \ + ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256i)(C), \ + (int)(I))) + +#define _mm_permute2_ps(X, Y, C, I) \ + ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (__v4si)(__m128i)(C), \ + (int)(I))) + +#define _mm256_permute2_ps(X, Y, C, I) \ + ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256i)(C), \ + (int)(I))) +#endif /* __OPTIMIZE__ */ + +#ifdef __DISABLE_XOP__ +#undef __DISABLE_XOP__ +#pragma GCC pop_options +#endif /* __DISABLE_XOP__ */ + +#endif /* _XOPMMINTRIN_H_INCLUDED */ diff --git a/include-gcc/xsavecintrin.h b/include-gcc/xsavecintrin.h new file mode 100644 index 0000000..185863a --- /dev/null +++ b/include-gcc/xsavecintrin.h @@ -0,0 +1,58 @@ +/* Copyright (C) 2014-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + Under Section 7 of GPL version 3, you are granted additional + permissions described in the GCC Runtime Library Exception, version + 3.1, as published by the Free Software Foundation. + + You should have received a copy of the GNU General Public License and + a copy of the GCC Runtime Library Exception along with this program; + see the files COPYING3 and COPYING.RUNTIME respectively. If not, see + . */ + +#ifndef _X86GPRINTRIN_H_INCLUDED +# error "Never use directly; include instead." +#endif + +#ifndef _XSAVECINTRIN_H_INCLUDED +#define _XSAVECINTRIN_H_INCLUDED + +#ifndef __XSAVEC__ +#pragma GCC push_options +#pragma GCC target("xsavec") +#define __DISABLE_XSAVEC__ +#endif /* __XSAVEC__ */ + +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsavec (void *__P, long long __M) +{ + __builtin_ia32_xsavec (__P, __M); +} + +#ifdef __x86_64__ +extern __inline void +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_xsavec64 (void *__P, long long __M) +{ + __builtin_ia32_xsavec64 (__P, __M); +} +#endif + +#ifdef __DISABLE_XSAVEC__ +#undef __DISABLE_XSAVEC__ +#pragma GCC pop_options +#endif /* __DISABLE_XSAVEC__ */ + +#endif /* _XSAVECINTRIN_H_INCLUDED */ diff --git a/include-gcc/xsaveintrin.h b/include-gcc/xsaveintrin.h new file mode 100644 index 0000000..092b1fe --- /dev/null +++ b/include-gcc/xsaveintrin.h @@ -0,0 +1,86 @@ +/* Copyright (C) 2012-2023 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 3, or (at your option) + any later version. 
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsaveintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVEINTRIN_H_INCLUDED
+#define _XSAVEINTRIN_H_INCLUDED
+
+#ifndef __XSAVE__
+#pragma GCC push_options
+#pragma GCC target("xsave")
+#define __DISABLE_XSAVE__
+#endif /* __XSAVE__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave (void *__P, long long __M)
+{
+  __builtin_ia32_xsave (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor (void *__P, long long __M)
+{
+  __builtin_ia32_xrstor (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsetbv (unsigned int __A, long long __V)
+{
+  __builtin_ia32_xsetbv (__A, __V);
+}
+
+extern __inline long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xgetbv (unsigned int __A)
+{
+  return __builtin_ia32_xgetbv (__A);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsave64 (void *__P, long long __M)
+{
+  __builtin_ia32_xsave64 (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstor64 (void *__P, long long __M)
+{
+  __builtin_ia32_xrstor64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVE__
+#undef __DISABLE_XSAVE__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVE__ */
+
+#endif /* _XSAVEINTRIN_H_INCLUDED */
diff --git a/include-gcc/xsaveoptintrin.h b/include-gcc/xsaveoptintrin.h
new file mode 100644
index 0000000..337b006
--- /dev/null
+++ b/include-gcc/xsaveoptintrin.h
@@ -0,0 +1,58 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsaveoptintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVEOPTINTRIN_H_INCLUDED
+#define _XSAVEOPTINTRIN_H_INCLUDED
+
+#ifndef __XSAVEOPT__
+#pragma GCC push_options
+#pragma GCC target("xsaveopt")
+#define __DISABLE_XSAVEOPT__
+#endif /* __XSAVEOPT__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt (void *__P, long long __M)
+{
+  __builtin_ia32_xsaveopt (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaveopt64 (void *__P, long long __M)
+{
+  __builtin_ia32_xsaveopt64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVEOPT__
+#undef __DISABLE_XSAVEOPT__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVEOPT__ */
+
+#endif /* _XSAVEOPTINTRIN_H_INCLUDED */
diff --git a/include-gcc/xsavesintrin.h b/include-gcc/xsavesintrin.h
new file mode 100644
index 0000000..6a230d0
--- /dev/null
+++ b/include-gcc/xsavesintrin.h
@@ -0,0 +1,72 @@
+/* Copyright (C) 2014-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xsavesintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XSAVESINTRIN_H_INCLUDED
+#define _XSAVESINTRIN_H_INCLUDED
+
+#ifndef __XSAVES__
+#pragma GCC push_options
+#pragma GCC target("xsaves")
+#define __DISABLE_XSAVES__
+#endif /* __XSAVES__ */
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaves (void *__P, long long __M)
+{
+  __builtin_ia32_xsaves (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstors (void *__P, long long __M)
+{
+  __builtin_ia32_xrstors (__P, __M);
+}
+
+#ifdef __x86_64__
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xrstors64 (void *__P, long long __M)
+{
+  __builtin_ia32_xrstors64 (__P, __M);
+}
+
+extern __inline void
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xsaves64 (void *__P, long long __M)
+{
+  __builtin_ia32_xsaves64 (__P, __M);
+}
+#endif
+
+#ifdef __DISABLE_XSAVES__
+#undef __DISABLE_XSAVES__
+#pragma GCC pop_options
+#endif /* __DISABLE_XSAVES__ */
+
+#endif /* _XSAVESINTRIN_H_INCLUDED */
diff --git a/include-gcc/xtestintrin.h b/include-gcc/xtestintrin.h
new file mode 100644
index 0000000..7216e80
--- /dev/null
+++ b/include-gcc/xtestintrin.h
@@ -0,0 +1,51 @@
+/* Copyright (C) 2012-2023 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _X86GPRINTRIN_H_INCLUDED
+# error "Never use <xtestintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef _XTESTINTRIN_H_INCLUDED
+#define _XTESTINTRIN_H_INCLUDED
+
+#ifndef __RTM__
+#pragma GCC push_options
+#pragma GCC target("rtm")
+#define __DISABLE_RTM__
+#endif /* __RTM__ */
+
+/* Return non-zero if the instruction executes inside an RTM or HLE code
+   region.  Return zero otherwise.  */
+extern __inline int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_xtest (void)
+{
+  return __builtin_ia32_xtest ();
+}
+
+#ifdef __DISABLE_RTM__
+#undef __DISABLE_RTM__
+#pragma GCC pop_options
+#endif /* __DISABLE_RTM__ */
+
+#endif /* _XTESTINTRIN_H_INCLUDED */
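Usage note (not part of the imported headers): the sketch below shows how application code built against this library might call the XSAVE-family intrinsics added above. The save-area size, the `-mxsave` build flag, and the assumption of ring-0 execution with OSXSAVE enabled are illustrative choices for the sketch, not requirements imposed by this commit.

/* Illustrative sketch: save and restore x87/SSE state with the intrinsics
 * from xsaveintrin.h.  Assumes the CPU has XSAVE enabled (CR4.OSXSAVE set,
 * as a unikernel running at ring 0 can arrange) and that this file is
 * compiled with -mxsave; otherwise the always_inline definitions guarded
 * by #pragma GCC target("xsave") fail to inline.  The 4 KiB area is an
 * arbitrary size for this sketch; real code should size the save area
 * from CPUID leaf 0DH. */
#include <immintrin.h>

/* XSAVE/XRSTOR require a 64-byte-aligned save area. */
static unsigned char xsave_area[4096] __attribute__((aligned(64)));

void snapshot_and_restore_fpu_sse(void)
{
	/* XCR0 bit 0 selects x87 state, bit 1 selects SSE state. */
	long long mask = _xgetbv(0) & 0x3;

	_xsave(xsave_area, mask);   /* dump the selected state components */
	/* ... the FPU/SSE registers may be clobbered here ... */
	_xrstor(xsave_area, mask);  /* bring the saved state back */
}

The same pattern applies to _xsavec, _xsaveopt and the supervisor-only _xsaves/_xrstors variants, each behind its own -m flag and target pragma.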
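Similarly, a minimal sketch of _xtest() from xtestintrin.h, whose comment above documents that it returns non-zero only inside an RTM or HLE region. It assumes RTM-capable hardware, a build with -mrtm, and that _xbegin()/_xend()/_XBEGIN_STARTED are provided by the companion rtmintrin.h header from the same import.

#include <immintrin.h>

int add_one_transactionally(volatile int *counter)
{
	if (_xbegin() == _XBEGIN_STARTED) {
		int in_tx = _xtest();  /* non-zero: we are inside the RTM region */
		++*counter;
		_xend();
		return in_tx;
	}
	/* Abort path: fall back to a plain, non-transactional update. */
	++*counter;
	return 0;
}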