(__SIZE_TYPE__)(stride));
}
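+/* The intrinsics below rely on compiler builtins that only exist in newer
+ * clang releases, so each group is wrapped in a __clang_major__ check and
+ * simply drops out when the header is used with an older compiler, rather
+ * than failing with unknown-builtin errors. */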
+#if (__clang_major__ >= 13)
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
return __builtin_ia32_tileloaddt164_internal(m, n, base,
(__SIZE_TYPE__)(stride));
}
+#endif
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
}
-
+#if (__clang_major__ >= 13)
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
}
+#endif
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ void __DEFAULT_FN_ATTRS_INT8
(__SIZE_TYPE__)(stride), tile);
}
+#if (__clang_major__ >= 13)
/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
_tile1024i dst, _tile1024i src1, _tile1024i src2) {
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
+#endif
/// This struct packs the shape and tile data together for the user. We suggest
/// initializing the struct as early as possible, because the compiler depends
dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
}
+#if (__clang_major__ >= 13)
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
__SIZE_TYPE__ stride) {
dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
}
+#endif
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
src0.tile, src1.tile);
}
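+/* Illustrative usage of the tile wrappers (a sketch only; it assumes the
+ * __tile_loadd/__tile_dpbssd/__tile_stored declarations provided elsewhere
+ * in this header and a 16-row by 64-byte tile shape):
+ *
+ *   __tile1024i a = {16, 64}, b = {16, 64}, c = {16, 64};
+ *   __tile_loadd(&a, bufA, strideA);   // load the operand tiles
+ *   __tile_loadd(&b, bufB, strideB);
+ *   __tile_dpbssd(&c, a, b);           // c += signed-byte dot-product
+ *   __tile_stored(bufC, strideC, c);   // write the accumulator back
+ */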
+#if (__clang_major__ >= 13)
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
+#endif
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride".
dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
}
+#if (__clang_major__ >= 13)
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
src0.tile, src1.tile);
}
+#endif
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#endif
}
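+/* The __builtin_ia32_reduce_fmax_pd512-style builtins used below are only
+ * available in newer clang releases (the version checks assume clang 13), so
+ * for older compilers fall back to an explicit shuffle-and-compare reduction,
+ * essentially what earlier releases of this header did. */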
+#if (__clang_major__ < 13)
+#define _mm512_mask_reduce_operator(op) \
+ __m256d __t1 = _mm512_extractf64x4_pd(__V, 0); \
+ __m256d __t2 = _mm512_extractf64x4_pd(__V, 1); \
+ __m256d __t3 = _mm256_##op(__t1, __t2); \
+ __m128d __t4 = _mm256_extractf128_pd(__t3, 0); \
+ __m128d __t5 = _mm256_extractf128_pd(__t3, 1); \
+ __m128d __t6 = _mm_##op(__t4, __t5); \
+ __m128d __t7 = __builtin_shufflevector(__t6, __t6, 1, 0); \
+ __m128d __t8 = _mm_##op(__t6, __t7); \
+ return __t8[0]
+#endif
+
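+/* The fallback reduces the eight doubles with a three-step tree: combine the
+ * two 256-bit halves with max_pd/min_pd, combine the two 128-bit halves of
+ * that result, then swap the last remaining pair of elements and combine once
+ * more, leaving the reduced value in element 0. */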
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_max_pd(__m512d __V) {
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(max_pd);
+#else
return __builtin_ia32_reduce_fmax_pd512(__V);
+#endif
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_reduce_min_pd(__m512d __V) {
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(min_pd);
+#else
return __builtin_ia32_reduce_fmin_pd512(__V);
+#endif
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
__V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V);
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(max_pd);
+#else
return __builtin_ia32_reduce_fmax_pd512(__V);
+#endif
}
static __inline__ double __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
__V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V);
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(min_pd);
+#else
return __builtin_ia32_reduce_fmin_pd512(__V);
+#endif
}
+#if (__clang_major__ < 13)
+#undef _mm512_mask_reduce_operator
+
+#define _mm512_mask_reduce_operator(op) \
+ __m256 __t1 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 0); \
+ __m256 __t2 = (__m256)_mm512_extractf64x4_pd((__m512d)__V, 1); \
+ __m256 __t3 = _mm256_##op(__t1, __t2); \
+ __m128 __t4 = _mm256_extractf128_ps(__t3, 0); \
+ __m128 __t5 = _mm256_extractf128_ps(__t3, 1); \
+ __m128 __t6 = _mm_##op(__t4, __t5); \
+ __m128 __t7 = __builtin_shufflevector(__t6, __t6, 2, 3, 0, 1); \
+ __m128 __t8 = _mm_##op(__t6, __t7); \
+ __m128 __t9 = __builtin_shufflevector(__t8, __t8, 1, 0, 3, 2); \
+ __m128 __t10 = _mm_##op(__t8, __t9); \
+ return __t10[0]
+#endif
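+/* Same tree reduction for sixteen floats. The 512-bit vector is reinterpreted
+ * as doubles so _mm512_extractf64x4_pd can split it (the float extract,
+ * _mm512_extractf32x8_ps, needs AVX512DQ), then the result is cast back and
+ * reduced 256 -> 128 -> 64 -> 32 bits, with one extra shuffle stage because
+ * each 128-bit piece holds four floats. */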
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_max_ps(__m512 __V) {
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(max_ps);
+#else
return __builtin_ia32_reduce_fmax_ps512(__V);
+#endif
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_reduce_min_ps(__m512 __V) {
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(min_ps);
+#else
return __builtin_ia32_reduce_fmin_ps512(__V);
+#endif
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
__V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V);
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(max_ps);
+#else
return __builtin_ia32_reduce_fmax_ps512(__V);
+#endif
}
static __inline__ float __DEFAULT_FN_ATTRS512
_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
__V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V);
+#if (__clang_major__ < 13)
+ _mm512_mask_reduce_operator(min_ps);
+#else
return __builtin_ia32_reduce_fmin_ps512(__V);
+#endif
}
+#if (__clang_major__ < 13)
+#undef _mm512_mask_reduce_operator
+#endif
/// Moves the least significant 32 bits of a vector of [16 x i32] to a
/// 32-bit signed integer value.