// Copyright 2020 Google LLC
// SPDX-License-Identifier: Apache-2.0
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef HIGHWAY_HWY_BASE_H_
#define HIGHWAY_HWY_BASE_H_

// For SIMD module implementations and their callers, target-independent.

// IWYU pragma: begin_exports
#include <stddef.h>
#include <stdint.h>

#include "hwy/detect_compiler_arch.h"
#include "hwy/highway_export.h"

// "IWYU pragma: keep" does not work for these includes, so hide from the IDE.
#if !HWY_IDE

#if !defined(HWY_NO_LIBCXX)
#ifndef __STDC_FORMAT_MACROS
#define __STDC_FORMAT_MACROS  // before inttypes.h
#endif
#include <inttypes.h>
#endif

#if (HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)) || HWY_COMPILER_MSVC
#include <atomic>
#endif

#endif  // !HWY_IDE
// IWYU pragma: end_exports

#if HWY_COMPILER_MSVC
#include <string.h>  // memcpy
#endif

//------------------------------------------------------------------------------
// Compiler-specific definitions

#define HWY_STR_IMPL(macro) #macro
#define HWY_STR(macro) HWY_STR_IMPL(macro)

#if HWY_COMPILER_MSVC

#include <intrin.h>

#define HWY_RESTRICT __restrict
#define HWY_INLINE __forceinline
#define HWY_NOINLINE __declspec(noinline)
#define HWY_FLATTEN
#define HWY_NORETURN __declspec(noreturn)
#define HWY_LIKELY(expr) (expr)
#define HWY_UNLIKELY(expr) (expr)
#define HWY_PRAGMA(tokens) __pragma(tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(warning(tokens))
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(msc)
#define HWY_MAYBE_UNUSED
#define HWY_HAS_ASSUME_ALIGNED 0
#if (_MSC_VER >= 1700)
#define HWY_MUST_USE_RESULT _Check_return_
#else
#define HWY_MUST_USE_RESULT
#endif

#else

#define HWY_RESTRICT __restrict__
// Forcing inlining while optimization is disabled creates very inefficient
// code that can cause compiler timeouts.
#ifdef __OPTIMIZE__
#define HWY_INLINE inline __attribute__((always_inline))
#else
#define HWY_INLINE inline
#endif
#define HWY_NOINLINE __attribute__((noinline))
#define HWY_FLATTEN __attribute__((flatten))
#define HWY_NORETURN __attribute__((noreturn))
#define HWY_LIKELY(expr) __builtin_expect(!!(expr), 1)
#define HWY_UNLIKELY(expr) __builtin_expect(!!(expr), 0)
#define HWY_PRAGMA(tokens) _Pragma(#tokens)
#define HWY_DIAGNOSTICS(tokens) HWY_PRAGMA(GCC diagnostic tokens)
#define HWY_DIAGNOSTICS_OFF(msc, gcc) HWY_DIAGNOSTICS(gcc)
// Encountered "attribute list cannot appear here" when using the C++17
// [[maybe_unused]], so only use the old-style attribute for now.
#define HWY_MAYBE_UNUSED __attribute__((unused))
#define HWY_MUST_USE_RESULT __attribute__((warn_unused_result))

#endif  // !HWY_COMPILER_MSVC

//------------------------------------------------------------------------------
// Builtin/attributes (no more #include after this point due to namespace!)

namespace hwy {

// Enables error-checking of format strings.
#if HWY_HAS_ATTRIBUTE(__format__)
#define HWY_FORMAT(idx_fmt, idx_arg) \
  __attribute__((__format__(__printf__, idx_fmt, idx_arg)))
#else
#define HWY_FORMAT(idx_fmt, idx_arg)
#endif
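
// Example (illustration only; `LogError` is a hypothetical user function, not
// part of this header). Applying HWY_FORMAT lets the compiler check arguments
// against the format string:
//   HWY_FORMAT(1, 2) void LogError(const char* format, ...);
//   LogError("%d lanes", 4);  // OK
//   LogError("%s lanes", 4);  // warning: %s expects a string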
// Returns a void* pointer which the compiler then assumes is N-byte aligned.
// Example: float* HWY_RESTRICT aligned = (float*)HWY_ASSUME_ALIGNED(in, 32);
//
// The assignment semantics are required by GCC/Clang. ICC provides an in-place
// __assume_aligned, whereas MSVC's __assume appears unsuitable.
#if HWY_HAS_BUILTIN(__builtin_assume_aligned)
#define HWY_ASSUME_ALIGNED(ptr, align) __builtin_assume_aligned((ptr), (align))
#else
#define HWY_ASSUME_ALIGNED(ptr, align) (ptr) /* not supported */
#endif

// Clang and GCC require attributes on each function into which SIMD
// intrinsics are inlined. Support both per-function annotation (HWY_ATTR) for
// lambdas and automatic annotation via pragmas.
#if HWY_COMPILER_ICC
// As of ICC 2021.{1-9} the pragma is neither implemented nor required.
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#elif HWY_COMPILER_CLANG
#define HWY_PUSH_ATTRIBUTES(targets_str)                                 \
  HWY_PRAGMA(clang attribute push(__attribute__((target(targets_str))), \
                                  apply_to = function))
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(clang attribute pop)
#elif HWY_COMPILER_GCC_ACTUAL
#define HWY_PUSH_ATTRIBUTES(targets_str) \
  HWY_PRAGMA(GCC push_options) HWY_PRAGMA(GCC target targets_str)
#define HWY_POP_ATTRIBUTES HWY_PRAGMA(GCC pop_options)
#else
#define HWY_PUSH_ATTRIBUTES(targets_str)
#define HWY_POP_ATTRIBUTES
#endif

//------------------------------------------------------------------------------
// Macros

#define HWY_API static HWY_INLINE HWY_FLATTEN HWY_MAYBE_UNUSED

#define HWY_CONCAT_IMPL(a, b) a##b
#define HWY_CONCAT(a, b) HWY_CONCAT_IMPL(a, b)

#define HWY_MIN(a, b) ((a) < (b) ? (a) : (b))
#define HWY_MAX(a, b) ((a) > (b) ? (a) : (b))

#if HWY_COMPILER_GCC_ACTUAL
// nielskm: GCC does not support '#pragma GCC unroll' without the factor.
#define HWY_UNROLL(factor) HWY_PRAGMA(GCC unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL(4)
#elif HWY_COMPILER_CLANG || HWY_COMPILER_ICC || HWY_COMPILER_ICX
#define HWY_UNROLL(factor) HWY_PRAGMA(unroll factor)
#define HWY_DEFAULT_UNROLL HWY_UNROLL()
#else
#define HWY_UNROLL(factor)
#define HWY_DEFAULT_UNROLL
#endif
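
// Example usage (sketch; `in`, `out` and `n` are placeholders): request
// unrolling where beneficial; on unsupported compilers this is a no-op.
//   HWY_DEFAULT_UNROLL
//   for (size_t i = 0; i < n; ++i) {
//     out[i] = in[i] * 2;
//   }
// Conversely, HWY_UNROLL(1) requests that the loop not be unrolled.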
// Tell a compiler that the expression always evaluates to true.
// The expression should be free of any side effects.
// Some older compilers may have trouble with complex expressions, so it is
// advisable to split multiple conditions into separate assume statements and
// manually check the generated code.
// OK but could fail:
//   HWY_ASSUME(x == 2 && y == 3);
// Better:
//   HWY_ASSUME(x == 2);
//   HWY_ASSUME(y == 3);
#if HWY_HAS_CPP_ATTRIBUTE(assume)
#define HWY_ASSUME(expr) [[assume(expr)]]
#elif HWY_COMPILER_MSVC || HWY_COMPILER_ICC
#define HWY_ASSUME(expr) __assume(expr)
// __builtin_assume() was added in clang 3.6.
#elif HWY_COMPILER_CLANG && HWY_HAS_BUILTIN(__builtin_assume)
#define HWY_ASSUME(expr) __builtin_assume(expr)
// __builtin_unreachable() was added in GCC 4.5, but __has_builtin() was added
// later, so check for the compiler version directly.
#elif HWY_COMPILER_GCC_ACTUAL >= 405
#define HWY_ASSUME(expr) \
  ((expr) ? static_cast<void>(0) : __builtin_unreachable())
#else
#define HWY_ASSUME(expr) static_cast<void>(0)
#endif

// Compile-time fence to prevent undesirable code reordering. On Clang x86, the
// typical asm volatile("" : : : "memory") has no effect, whereas an atomic
// fence does, without generating code.
#if HWY_ARCH_X86 && !defined(HWY_NO_LIBCXX)
#define HWY_FENCE std::atomic_thread_fence(std::memory_order_acq_rel)
#else
// TODO(janwas): investigate alternatives. On Arm, the above generates
// barriers.
#define HWY_FENCE
#endif

// 4 instances of a given literal value, useful as input to LoadDup128.
#define HWY_REP4(literal) literal, literal, literal, literal

HWY_DLLEXPORT HWY_NORETURN void HWY_FORMAT(3, 4)
    Abort(const char* file, int line, const char* format, ...);

#define HWY_ABORT(format, ...) \
  ::hwy::Abort(__FILE__, __LINE__, format, ##__VA_ARGS__)

// Always enabled.
#define HWY_ASSERT(condition)             \
  do {                                    \
    if (!(condition)) {                   \
      HWY_ABORT("Assert %s", #condition); \
    }                                     \
  } while (0)

#if HWY_HAS_FEATURE(memory_sanitizer) || defined(MEMORY_SANITIZER)
#define HWY_IS_MSAN 1
#else
#define HWY_IS_MSAN 0
#endif

#if HWY_HAS_FEATURE(address_sanitizer) || defined(ADDRESS_SANITIZER)
#define HWY_IS_ASAN 1
#else
#define HWY_IS_ASAN 0
#endif

#if HWY_HAS_FEATURE(thread_sanitizer) || defined(THREAD_SANITIZER)
#define HWY_IS_TSAN 1
#else
#define HWY_IS_TSAN 0
#endif

// MSAN may cause lengthy build times or false positives, e.g. in AVX3
// DemoteTo. You can disable MSAN by adding this attribute to the function
// that fails.
#if HWY_IS_MSAN
#define HWY_ATTR_NO_MSAN __attribute__((no_sanitize_memory))
#else
#define HWY_ATTR_NO_MSAN
#endif

// For enabling HWY_DASSERT and shortening tests in slower debug builds
#if !defined(HWY_IS_DEBUG_BUILD)
// Clang does not define NDEBUG, but it and GCC define __OPTIMIZE__, and recent
// MSVC defines NDEBUG (if not, could instead check _DEBUG).
#if (!defined(__OPTIMIZE__) && !defined(NDEBUG)) || HWY_IS_ASAN || \
    HWY_IS_MSAN || HWY_IS_TSAN || defined(__clang_analyzer__)
#define HWY_IS_DEBUG_BUILD 1
#else
#define HWY_IS_DEBUG_BUILD 0
#endif
#endif  // HWY_IS_DEBUG_BUILD

#if HWY_IS_DEBUG_BUILD
#define HWY_DASSERT(condition) HWY_ASSERT(condition)
#else
#define HWY_DASSERT(condition) \
  do {                         \
  } while (0)
#endif

//------------------------------------------------------------------------------
// CopyBytes / ZeroBytes

#if HWY_COMPILER_MSVC
#pragma intrinsic(memcpy)
#pragma intrinsic(memset)
#endif

// The source/destination must not overlap/alias.
template <size_t kBytes, typename From, typename To>
HWY_API void CopyBytes(const From* from, To* to) {
#if HWY_COMPILER_MSVC
  memcpy(to, from, kBytes);
#else
  __builtin_memcpy(static_cast<void*>(to), static_cast<const void*>(from),
                   kBytes);
#endif
}

HWY_API void CopyBytes(const void* HWY_RESTRICT from, void* HWY_RESTRICT to,
                       size_t num_of_bytes_to_copy) {
#if HWY_COMPILER_MSVC
  memcpy(to, from, num_of_bytes_to_copy);
#else
  __builtin_memcpy(to, from, num_of_bytes_to_copy);
#endif
}

// Same as CopyBytes, but for same-sized objects; avoids a size argument.
template <typename From, typename To>
HWY_API void CopySameSize(const From* HWY_RESTRICT from, To* HWY_RESTRICT to) {
  static_assert(sizeof(From) == sizeof(To), "");
  CopyBytes<sizeof(From)>(from, to);
}

template <size_t kBytes, typename To>
HWY_API void ZeroBytes(To* to) {
#if HWY_COMPILER_MSVC
  memset(to, 0, kBytes);
#else
  __builtin_memset(to, 0, kBytes);
#endif
}

HWY_API void ZeroBytes(void* to, size_t num_bytes) {
#if HWY_COMPILER_MSVC
  memset(to, 0, num_bytes);
#else
  __builtin_memset(to, 0, num_bytes);
#endif
}
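
// Example (illustration only): CopySameSize expresses the bit-cast idiom
// without violating strict aliasing, e.g. for inspecting the bits of a float:
//   float f = 1.0f;
//   uint32_t bits;
//   CopySameSize(&f, &bits);  // bits == 0x3F800000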
//------------------------------------------------------------------------------
// kMaxVectorSize (undocumented, pending removal)

#if HWY_ARCH_X86
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 64;  // AVX-512
#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
// Not actually an upper bound on the size.
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 4096;
#else
static constexpr HWY_MAYBE_UNUSED size_t kMaxVectorSize = 16;
#endif

//------------------------------------------------------------------------------
// Alignment

// Potentially useful for LoadDup128 and capped vectors. In other cases, arrays
// should be allocated dynamically via aligned_allocator.h because Lanes() may
// exceed the stack size.
#if HWY_ARCH_X86
#define HWY_ALIGN_MAX alignas(64)
#elif HWY_ARCH_RVV && defined(__riscv_v_intrinsic) && \
    __riscv_v_intrinsic >= 11000
#define HWY_ALIGN_MAX alignas(8)  // only elements need be aligned
#else
#define HWY_ALIGN_MAX alignas(16)
#endif

//------------------------------------------------------------------------------
// Lane types

// float16_t load/store/conversion intrinsics are always supported on Armv8 and
// VFPv4 (except with MSVC). On Armv7 Clang requires __ARM_FP & 2; GCC requires
// -mfp16-format=ieee.
#if (HWY_ARCH_ARM_A64 && !HWY_COMPILER_MSVC) ||                    \
    (HWY_COMPILER_CLANG && defined(__ARM_FP) && (__ARM_FP & 2)) || \
    (HWY_COMPILER_GCC_ACTUAL && defined(__ARM_FP16_FORMAT_IEEE))
#define HWY_NEON_HAVE_FLOAT16C 1
#else
#define HWY_NEON_HAVE_FLOAT16C 0
#endif

// If 1, both __bf16 and a limited set of *_bf16 SVE intrinsics are available:
// create/get/set/dup, ld/st, sel, rev, trn, uzp, zip.
#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SVE_BF16)
#define HWY_SVE_HAVE_BFLOAT16 1
#else
#define HWY_SVE_HAVE_BFLOAT16 0
#endif

// Match [u]int##_t naming scheme so rvv-inl.h macros can obtain the type name
// by concatenating base type and bits.

// 1) ACLE's __fp16
#if HWY_NEON_HAVE_FLOAT16C
using float16_t = __fp16;
// 2) C11 extension ISO/IEC TS 18661-3:2015, but not supported on all targets.
// Required if HWY_HAVE_FLOAT16, i.e. RVV with zvfh or AVX3_SPR (with
// sufficiently new compiler supporting avx512fp16). Do not use on clang-cl,
// which is missing __extendhfsf2.
#elif ((HWY_ARCH_RVV && defined(__riscv_zvfh) && HWY_COMPILER_CLANG) || \
       (HWY_ARCH_X86 &&                                                 \
        ((HWY_COMPILER_CLANG >= 1600 && !HWY_COMPILER_CLANGCL) ||       \
         HWY_COMPILER_GCC_ACTUAL >= 1200)))
using float16_t = _Float16;
// 3) Otherwise emulate
#else
#define HWY_EMULATE_FLOAT16

#pragma pack(push, 1)
struct float16_t {
  uint16_t bits;
};
#pragma pack(pop)

#endif  // float16_t

#if HWY_SVE_HAVE_BFLOAT16
using bfloat16_t = __bf16;
#else
#pragma pack(push, 1)
struct bfloat16_t {
  uint16_t bits;
};
#pragma pack(pop)
#endif  // bfloat16_t
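
// Note (illustration only; applies when HWY_EMULATE_FLOAT16 is defined): the
// emulated types are plain 16-bit containers, so values are created by copying
// in the IEEE bit pattern, as the conversion helpers below do:
//   const uint16_t kOne = 0x3C00;  // 1.0 in binary16
//   float16_t h;
//   CopySameSize(&kOne, &h);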
HWY_API float F32FromF16(float16_t f16) {
#ifdef HWY_EMULATE_FLOAT16
  uint16_t bits16;
  CopySameSize(&f16, &bits16);
  const uint32_t sign = static_cast<uint32_t>(bits16 >> 15);
  const uint32_t biased_exp = (bits16 >> 10) & 0x1F;
  const uint32_t mantissa = bits16 & 0x3FF;

  // Subnormal or zero
  if (biased_exp == 0) {
    const float subnormal =
        (1.0f / 16384) * (static_cast<float>(mantissa) * (1.0f / 1024));
    return sign ? -subnormal : subnormal;
  }

  // Normalized: convert the representation directly (faster than
  // ldexp/tables).
  const uint32_t biased_exp32 = biased_exp + (127 - 15);
  const uint32_t mantissa32 = mantissa << (23 - 10);
  const uint32_t bits32 = (sign << 31) | (biased_exp32 << 23) | mantissa32;

  float result;
  CopySameSize(&bits32, &result);
  return result;
#else
  return static_cast<float>(f16);
#endif
}

HWY_API float16_t F16FromF32(float f32) {
#ifdef HWY_EMULATE_FLOAT16
  uint32_t bits32;
  CopySameSize(&f32, &bits32);
  const uint32_t sign = bits32 >> 31;
  const uint32_t biased_exp32 = (bits32 >> 23) & 0xFF;
  const uint32_t mantissa32 = bits32 & 0x7FFFFF;

  const int32_t exp = HWY_MIN(static_cast<int32_t>(biased_exp32) - 127, 15);

  // Tiny or zero => zero.
  float16_t out;
  if (exp < -24) {
    // restore original sign
    const uint16_t bits = static_cast<uint16_t>(sign << 15);
    CopySameSize(&bits, &out);
    return out;
  }

  uint32_t biased_exp16, mantissa16;

  // exp = [-24, -15] => subnormal
  if (exp < -14) {
    biased_exp16 = 0;
    const uint32_t sub_exp = static_cast<uint32_t>(-14 - exp);
    HWY_DASSERT(1 <= sub_exp && sub_exp < 11);
    mantissa16 = static_cast<uint32_t>((1u << (10 - sub_exp)) +
                                       (mantissa32 >> (13 + sub_exp)));
  } else {  // exp = [-14, 15]
    biased_exp16 = static_cast<uint32_t>(exp + 15);
    HWY_DASSERT(1 <= biased_exp16 && biased_exp16 < 31);
    mantissa16 = mantissa32 >> 13;
  }

  HWY_DASSERT(mantissa16 < 1024);
  const uint32_t bits16 = (sign << 15) | (biased_exp16 << 10) | mantissa16;
  HWY_DASSERT(bits16 < 0x10000);
  const uint16_t narrowed = static_cast<uint16_t>(bits16);  // big-endian safe
  CopySameSize(&narrowed, &out);
  return out;
#else
  return static_cast<float16_t>(f32);
#endif
}

HWY_API float F32FromBF16(bfloat16_t bf) {
  uint16_t bits16;
  CopyBytes<2>(&bf, &bits16);
  uint32_t bits = bits16;
  bits <<= 16;
  float f;
  CopySameSize(&bits, &f);
  return f;
}

HWY_API float F32FromF16Mem(const void* ptr) {
  float16_t f16;
  CopyBytes<2>(ptr, &f16);
  return F32FromF16(f16);
}

HWY_API float F32FromBF16Mem(const void* ptr) {
  bfloat16_t bf;
  CopyBytes<2>(ptr, &bf);
  return F32FromBF16(bf);
}

HWY_API bfloat16_t BF16FromF32(float f) {
  uint32_t bits;
  CopySameSize(&f, &bits);
  const uint16_t bits16 = static_cast<uint16_t>(bits >> 16);
  bfloat16_t bf;
  CopySameSize(&bits16, &bf);
  return bf;
}

using float32_t = float;
using float64_t = double;
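
// Worked example: 1.5f has bits 0x3FC00000 (sign 0, biased exponent 127,
// mantissa 0x400000). F16FromF32 keeps the exponent (rebiased to 15) and the
// top 10 mantissa bits, yielding 0x3E00, and F32FromF16 restores exactly 1.5f.
// Note that the emulated narrowing conversions truncate the mantissa rather
// than round to nearest, so most inputs lose precision.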
#pragma pack(push, 1)

// Aligned 128-bit type. Cannot use __int128 because clang doesn't yet align
// it: https://reviews.llvm.org/D86310
struct alignas(16) uint128_t {
  uint64_t lo;  // little-endian layout
  uint64_t hi;
};

// 64-bit key plus 64-bit value. Faster than using uint128_t when only the key
// field is to be compared (Lt128Upper instead of Lt128).
struct alignas(16) K64V64 {
  uint64_t value;  // little-endian layout
  uint64_t key;
};

// 32-bit key plus 32-bit value. Allows vqsort recursions to terminate earlier
// than when considering both to be a 64-bit key.
struct alignas(8) K32V32 {
  uint32_t value;  // little-endian layout
  uint32_t key;
};

#pragma pack(pop)

#ifdef HWY_EMULATE_FLOAT16

static inline HWY_MAYBE_UNUSED bool operator<(const float16_t& a,
                                              const float16_t& b) {
  return F32FromF16(a) < F32FromF16(b);
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const float16_t& a,
                                              const float16_t& b) {
  return F32FromF16(a) > F32FromF16(b);
}
static inline HWY_MAYBE_UNUSED bool operator==(const float16_t& a,
                                               const float16_t& b) {
  return F32FromF16(a) == F32FromF16(b);
}

#endif  // HWY_EMULATE_FLOAT16

static inline HWY_MAYBE_UNUSED bool operator<(const uint128_t& a,
                                              const uint128_t& b) {
  return (a.hi == b.hi) ? a.lo < b.lo : a.hi < b.hi;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const uint128_t& a,
                                              const uint128_t& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const uint128_t& a,
                                               const uint128_t& b) {
  return a.lo == b.lo && a.hi == b.hi;
}

static inline HWY_MAYBE_UNUSED bool operator<(const K64V64& a,
                                              const K64V64& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K64V64& a,
                                              const K64V64& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K64V64& a,
                                               const K64V64& b) {
  return a.key == b.key;
}

static inline HWY_MAYBE_UNUSED bool operator<(const K32V32& a,
                                              const K32V32& b) {
  return a.key < b.key;
}
// Required for std::greater.
static inline HWY_MAYBE_UNUSED bool operator>(const K32V32& a,
                                              const K32V32& b) {
  return b < a;
}
static inline HWY_MAYBE_UNUSED bool operator==(const K32V32& a,
                                               const K32V32& b) {
  return a.key == b.key;
}

//------------------------------------------------------------------------------
// Controlling overload resolution (SFINAE)

template <bool Condition>
struct EnableIfT {};
template <>
struct EnableIfT<true> {
  using type = void;
};

template <bool Condition>
using EnableIf = typename EnableIfT<Condition>::type;

template <typename T, typename U>
struct IsSameT {
  enum { value = 0 };
};

template <typename T>
struct IsSameT<T, T> {
  enum { value = 1 };
};

template <typename T, typename U>
HWY_API constexpr bool IsSame() {
  return IsSameT<T, U>::value;
}

template <bool Condition, typename Then, typename Else>
struct IfT {
  using type = Then;
};

template <class Then, class Else>
struct IfT<false, Then, Else> {
  using type = Else;
};

template <bool Condition, typename Then, typename Else>
using If = typename IfT<Condition, Then, Else>::type;

// Insert into template/function arguments to enable this overload only for
// vectors of exactly, at most (LE), or more than (GT) this many bytes.
//
// As an example, checking for a total size of 16 bytes will match both
// Simd<uint8_t, 16, 0> and Simd<uint8_t, 8, 1>.
#define HWY_IF_V_SIZE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) == bytes>* = nullptr
#define HWY_IF_V_SIZE_LE(T, kN, bytes) \
  hwy::EnableIf<kN * sizeof(T) <= bytes>* = nullptr
#define HWY_IF_V_SIZE_GT(T, kN, bytes) \
  hwy::EnableIf<(kN * sizeof(T) > bytes)>* = nullptr

#define HWY_IF_LANES(kN, lanes) hwy::EnableIf<(kN == lanes)>* = nullptr
#define HWY_IF_LANES_LE(kN, lanes) hwy::EnableIf<(kN <= lanes)>* = nullptr
#define HWY_IF_LANES_GT(kN, lanes) hwy::EnableIf<(kN > lanes)>* = nullptr

#define HWY_IF_UNSIGNED(T) hwy::EnableIf<!IsSigned<T>()>* = nullptr
#define HWY_IF_SIGNED(T)                                                   \
  hwy::EnableIf<IsSigned<T>() && !IsFloat<T>() && !IsSpecialFloat<T>()>* = \
      nullptr
#define HWY_IF_FLOAT(T) hwy::EnableIf<hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT(T) hwy::EnableIf<!hwy::IsFloat<T>()>* = nullptr
#define HWY_IF_FLOAT3264(T) hwy::EnableIf<hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT3264(T) hwy::EnableIf<!hwy::IsFloat3264<T>()>* = nullptr
#define HWY_IF_SPECIAL_FLOAT(T) \
  hwy::EnableIf<hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_SPECIAL_FLOAT(T) \
  hwy::EnableIf<!hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_FLOAT_OR_SPECIAL(T) \
  hwy::EnableIf<hwy::IsFloat<T>() || hwy::IsSpecialFloat<T>()>* = nullptr
#define HWY_IF_NOT_FLOAT_NOR_SPECIAL(T) \
  hwy::EnableIf<!hwy::IsFloat<T>() && !hwy::IsSpecialFloat<T>()>* = nullptr

#define HWY_IF_T_SIZE(T, bytes) hwy::EnableIf<sizeof(T) == (bytes)>* = nullptr
#define HWY_IF_NOT_T_SIZE(T, bytes) \
  hwy::EnableIf<sizeof(T) != (bytes)>* = nullptr
// bit_array = 0x102 means 1 or 8 bytes. There is no NONE_OF because it sounds
// too similar. If you want the opposite of this (2 or 4 bytes), ask for those
// bits explicitly (0x14) instead of attempting to 'negate' 0x102.
#define HWY_IF_T_SIZE_ONE_OF(T, bit_array) \
  hwy::EnableIf<((size_t{1} << sizeof(T)) & (bit_array)) != 0>* = nullptr
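
// Example of use in an overload set (sketch; `MulAddU` is a hypothetical
// function, not part of this header). The macros expand to an unnamed pointer
// parameter with a default argument, so the overload participates only when
// the condition holds:
//   template <typename T, HWY_IF_UNSIGNED(T)>
//   T MulAddU(T a, T b, T c) {  // selected only for unsigned integer T
//     return static_cast<T>(a * b + c);
//   }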
// Use instead of HWY_IF_T_SIZE to avoid ambiguity with float16_t/float/double
// overloads.
#define HWY_IF_UI16(T) \
  hwy::EnableIf<IsSame<T, uint16_t>() || IsSame<T, int16_t>()>* = nullptr
#define HWY_IF_UI32(T) \
  hwy::EnableIf<IsSame<T, uint32_t>() || IsSame<T, int32_t>()>* = nullptr
#define HWY_IF_UI64(T) \
  hwy::EnableIf<IsSame<T, uint64_t>() || IsSame<T, int64_t>()>* = nullptr

#define HWY_IF_LANES_PER_BLOCK(T, N, LANES) \
  hwy::EnableIf<HWY_MIN(sizeof(T) * N, 16) / sizeof(T) == (LANES)>* = nullptr

// Empty struct used as a size tag type.
template <size_t N>
struct SizeTag {};

template <class T>
struct RemoveConstT {
  using type = T;
};
template <class T>
struct RemoveConstT<const T> {
  using type = T;
};
template <class T>
using RemoveConst = typename RemoveConstT<T>::type;

template <class T>
struct RemoveRefT {
  using type = T;
};
template <class T>
struct RemoveRefT<T&> {
  using type = T;
};
template <class T>
struct RemoveRefT<T&&> {
  using type = T;
};
template <class T>
using RemoveRef = typename RemoveRefT<T>::type;

//------------------------------------------------------------------------------
// Type relations

namespace detail {

template <typename T>
struct Relations;
template <>
struct Relations<uint8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int8_t> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
  using Wide = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = uint32_t;
  using Narrow = uint8_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = int32_t;
  using Narrow = int8_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = uint64_t;
  using Narrow = uint16_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int32_t> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = int64_t;
  using Narrow = int16_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Wide = uint128_t;
  using Narrow = uint32_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<int64_t> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = int32_t;
  enum { is_signed = 1, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<uint128_t> {
  using Unsigned = uint128_t;
  using Narrow = uint64_t;
  enum { is_signed = 0, is_float = 0, is_bf16 = 0 };
};
template <>
struct Relations<float16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<bfloat16_t> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Wide = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 1 };
};
template <>
struct Relations<float> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
  using Wide = double;
  using Narrow = float16_t;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
template <>
struct Relations<double> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
  using Narrow = float;
  enum { is_signed = 1, is_float = 1, is_bf16 = 0 };
};
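
// For example, Relations<uint8_t>::Wide is uint16_t and
// Relations<uint16_t>::Narrow is uint8_t; the MakeWide/MakeNarrow aliases
// below expose these mappings. Members are only defined where they exist:
// int64_t/double have no Wide and uint8_t has no Narrow, so widening or
// narrowing past those bounds is a compile error.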
template <size_t N>
struct TypeFromSize;
template <>
struct TypeFromSize<1> {
  using Unsigned = uint8_t;
  using Signed = int8_t;
};
template <>
struct TypeFromSize<2> {
  using Unsigned = uint16_t;
  using Signed = int16_t;
  using Float = float16_t;
};
template <>
struct TypeFromSize<4> {
  using Unsigned = uint32_t;
  using Signed = int32_t;
  using Float = float;
};
template <>
struct TypeFromSize<8> {
  using Unsigned = uint64_t;
  using Signed = int64_t;
  using Float = double;
};
template <>
struct TypeFromSize<16> {
  using Unsigned = uint128_t;
};

}  // namespace detail

// Aliases for types of a different category, but the same size.
template <typename T>
using MakeUnsigned = typename detail::Relations<T>::Unsigned;
template <typename T>
using MakeSigned = typename detail::Relations<T>::Signed;
template <typename T>
using MakeFloat = typename detail::Relations<T>::Float;

// Aliases for types of the same category, but different size.
template <typename T>
using MakeWide = typename detail::Relations<T>::Wide;
template <typename T>
using MakeNarrow = typename detail::Relations<T>::Narrow;

// Obtain type from its size [bytes].
template <size_t N>
using UnsignedFromSize = typename detail::TypeFromSize<N>::Unsigned;
template <size_t N>
using SignedFromSize = typename detail::TypeFromSize<N>::Signed;
template <size_t N>
using FloatFromSize = typename detail::TypeFromSize<N>::Float;

// Avoid confusion with SizeTag where the parameter is a lane size.
using UnsignedTag = SizeTag<0>;
using SignedTag = SizeTag<0x100>;  // integer
using FloatTag = SizeTag<0x200>;
using SpecialTag = SizeTag<0x300>;

template <typename T, class R = detail::Relations<T>>
constexpr auto TypeTag()
    -> hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)> {
  return hwy::SizeTag<((R::is_signed + R::is_float + R::is_bf16) << 8)>();
}

// For when we only want to distinguish FloatTag from everything else.
using NonFloatTag = SizeTag<0x400>;

template <typename T, class R = detail::Relations<T>>
constexpr auto IsFloatTag() -> hwy::SizeTag<(R::is_float ? 0x200 : 0x400)> {
  return hwy::SizeTag<(R::is_float ? 0x200 : 0x400)>();
}

//------------------------------------------------------------------------------
// Type traits

template <typename T>
HWY_API constexpr bool IsFloat3264() {
  return IsSame<T, float>() || IsSame<T, double>();
}

template <typename T>
HWY_API constexpr bool IsFloat() {
  // Cannot use T(1.25) != T(1) for float16_t, which can only be converted to
  // or from a float, not compared. Include float16_t in case
  // HWY_HAVE_FLOAT16=1.
  return IsSame<T, float16_t>() || IsFloat3264<T>();
}

// These types are often special-cased and not supported in all ops.
template <typename T>
HWY_API constexpr bool IsSpecialFloat() {
  return IsSame<T, float16_t>() || IsSame<T, bfloat16_t>();
}

template <typename T>
HWY_API constexpr bool IsSigned() {
  return T(0) > T(-1);
}
template <>
constexpr bool IsSigned<float16_t>() {
  return true;
}
template <>
constexpr bool IsSigned<bfloat16_t>() {
  return true;
}

// Largest/smallest representable integer values.
template <typename T>
HWY_API constexpr T LimitsMax() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  using TU = MakeUnsigned<T>;
  return static_cast<T>(IsSigned<T>() ? (static_cast<TU>(~0ull) >> 1)
                                      : static_cast<TU>(~0ull));
}
template <typename T>
HWY_API constexpr T LimitsMin() {
  static_assert(!IsFloat<T>(), "Only for integer types");
  return IsSigned<T>() ? T(-1) - LimitsMax<T>() : T(0);
}
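
// Example: LimitsMax<int8_t>() == 127 and LimitsMin<int8_t>() == -128;
// LimitsMax<uint16_t>() == 65535 and LimitsMin<uint16_t>() == 0. Both are
// usable in constant expressions:
//   static_assert(LimitsMax<int8_t>() == 127, "");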
// Largest/smallest representable value (integer or float). This naming avoids
// confusion with numeric_limits<T>::min() (the smallest positive value).
// Cannot be constexpr because we use CopySameSize for [b]float16_t.
template <typename T>
HWY_API T LowestValue() {
  return LimitsMin<T>();
}
template <>
HWY_INLINE bfloat16_t LowestValue<bfloat16_t>() {
  const uint16_t kBits = 0xFF7F;  // -1.1111111 x 2^127
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t LowestValue<float16_t>() {
  const uint16_t kBits = 0xFBFF;  // -1.1111111111 x 2^15
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float LowestValue<float>() {
  return -3.402823466e+38F;
}
template <>
HWY_INLINE double LowestValue<double>() {
  return -1.7976931348623158e+308;
}

template <typename T>
HWY_API T HighestValue() {
  return LimitsMax<T>();
}
template <>
HWY_INLINE bfloat16_t HighestValue<bfloat16_t>() {
  const uint16_t kBits = 0x7F7F;  // 1.1111111 x 2^127
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t HighestValue<float16_t>() {
  const uint16_t kBits = 0x7BFF;  // 1.1111111111 x 2^15
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float HighestValue<float>() {
  return 3.402823466e+38F;
}
template <>
HWY_INLINE double HighestValue<double>() {
  return 1.7976931348623158e+308;
}

// Difference between 1.0 and the next representable value. Equal to
// 1 / (1ULL << MantissaBits<T>()), but hard-coding ensures precision.
template <typename T>
HWY_API T Epsilon() {
  return 1;
}
template <>
HWY_INLINE bfloat16_t Epsilon<bfloat16_t>() {
  const uint16_t kBits = 0x3C00;  // 0.0078125
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t Epsilon<float16_t>() {
  const uint16_t kBits = 0x1400;  // 0.0009765625
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float Epsilon<float>() {
  return 1.192092896e-7f;
}
template <>
HWY_INLINE double Epsilon<double>() {
  return 2.2204460492503131e-16;
}

// Returns width in bits of the mantissa field in IEEE binary16/32/64.
template <typename T>
constexpr int MantissaBits() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
constexpr int MantissaBits<bfloat16_t>() {
  return 7;
}
template <>
constexpr int MantissaBits<float16_t>() {
  return 10;
}
template <>
constexpr int MantissaBits<float>() {
  return 23;
}
template <>
constexpr int MantissaBits<double>() {
  return 52;
}

// Returns the (left-shifted by one bit) IEEE binary16/32/64 representation
// with the largest possible (biased) exponent field. Used by IsInf.
template <typename T>
constexpr MakeSigned<T> MaxExponentTimes2() {
  return -(MakeSigned<T>{1} << (MantissaBits<T>() + 1));
}

// Returns bitmask of the sign bit in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> SignMask() {
  return MakeUnsigned<T>{1} << (sizeof(T) * 8 - 1);
}

// Returns bitmask of the exponent field in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> ExponentMask() {
  return (~(MakeUnsigned<T>{1} << MantissaBits<T>()) + 1) & ~SignMask<T>();
}

// Returns bitmask of the mantissa field in IEEE binary16/32/64.
template <typename T>
constexpr MakeUnsigned<T> MantissaMask() {
  return (MakeUnsigned<T>{1} << MantissaBits<T>()) - 1;
}
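
// Example for T = float (MantissaBits<float>() == 23):
//   SignMask<float>()     == 0x80000000u
//   ExponentMask<float>() == 0x7F800000u
//   MantissaMask<float>() == 0x007FFFFFu
// Together these partition all 32 bits: 1 sign + 8 exponent + 23 mantissa.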
// Returns 1 << MantissaBits<T>() as a floating-point number. All integers
// whose absolute value is less than this can be represented exactly.
template <typename T>
HWY_INLINE T MantissaEnd() {
  static_assert(sizeof(T) == 0, "Only instantiate the specializations");
  return 0;
}
template <>
HWY_INLINE bfloat16_t MantissaEnd<bfloat16_t>() {
  const uint16_t kBits = 0x4300;  // 1.0 x 2^7
  bfloat16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float16_t MantissaEnd<float16_t>() {
  const uint16_t kBits = 0x6400;  // 1.0 x 2^10
  float16_t ret;
  CopySameSize(&kBits, &ret);
  return ret;
}
template <>
HWY_INLINE float MantissaEnd<float>() {
  return 8388608.0f;  // 1 << 23
}
template <>
HWY_INLINE double MantissaEnd<double>() {
  // floating point literal with p52 requires C++17.
  return 4503599627370496.0;  // 1 << 52
}

// Returns width in bits of the exponent field in IEEE binary16/32/64.
template <typename T>
constexpr int ExponentBits() {
  // Exponent := remaining bits after deducting sign and mantissa.
  return 8 * sizeof(T) - 1 - MantissaBits<T>();
}

// Returns largest value of the biased exponent field in IEEE binary16/32/64,
// right-shifted so that the LSB is bit zero. Example: 0xFF for float.
// This is expressed as a signed integer for more efficient comparison.
template <typename T>
constexpr MakeSigned<T> MaxExponentField() {
  return (MakeSigned<T>{1} << ExponentBits<T>()) - 1;
}

//------------------------------------------------------------------------------
// Helper functions

template <typename T1, typename T2>
constexpr inline T1 DivCeil(T1 a, T2 b) {
  return (a + b - 1) / b;
}

// Works for any `align`; if a power of two, compiler emits ADD+AND.
constexpr inline size_t RoundUpTo(size_t what, size_t align) {
  return DivCeil(what, align) * align;
}
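
// Example: DivCeil(7, 3) == 3 and RoundUpTo(13, 8) == 16. For a power-of-two
// align such as RoundUpTo(size, 64), the division folds into ADD+AND.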
// Undefined results for x == 0.
HWY_API size_t Num0BitsBelowLS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanForward(&index, x);
  return index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsBelowLS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanForward64(&index, x);
  return index;
#else   // HWY_ARCH_X86_64
  // _BitScanForward64 not available
  uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
  unsigned long index;  // NOLINT
  if (lsb == 0) {
    uint32_t msb = static_cast<uint32_t>(x >> 32u);
    _BitScanForward(&index, msb);
    return 32 + index;
  } else {
    _BitScanForward(&index, lsb);
    return index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_ctzll(x));
#endif  // HWY_COMPILER_MSVC
}

// Undefined results for x == 0.
HWY_API size_t Num0BitsAboveMS1Bit_Nonzero32(const uint32_t x) {
#if HWY_COMPILER_MSVC
  unsigned long index;  // NOLINT
  _BitScanReverse(&index, x);
  return 31 - index;
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clz(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t Num0BitsAboveMS1Bit_Nonzero64(const uint64_t x) {
#if HWY_COMPILER_MSVC
#if HWY_ARCH_X86_64
  unsigned long index;  // NOLINT
  _BitScanReverse64(&index, x);
  return 63 - index;
#else   // HWY_ARCH_X86_64
  // _BitScanReverse64 not available
  const uint32_t msb = static_cast<uint32_t>(x >> 32u);
  unsigned long index;  // NOLINT
  if (msb == 0) {
    const uint32_t lsb = static_cast<uint32_t>(x & 0xFFFFFFFF);
    _BitScanReverse(&index, lsb);
    return 63 - index;
  } else {
    _BitScanReverse(&index, msb);
    return 31 - index;
  }
#endif  // HWY_ARCH_X86_64
#else   // HWY_COMPILER_MSVC
  return static_cast<size_t>(__builtin_clzll(x));
#endif  // HWY_COMPILER_MSVC
}

HWY_API size_t PopCount(uint64_t x) {
#if HWY_COMPILER_GCC  // includes clang
  return static_cast<size_t>(__builtin_popcountll(x));
  // This instruction has a separate feature flag, but is often called from
  // non-SIMD code, so we don't want to require dynamic dispatch. It was first
  // supported by Intel in Nehalem (SSE4.2), but MSVC only predefines a macro
  // for AVX, so check for that.
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64 && defined(__AVX__)
  return _mm_popcnt_u64(x);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_32 && defined(__AVX__)
  return _mm_popcnt_u32(static_cast<uint32_t>(x & 0xFFFFFFFFu)) +
         _mm_popcnt_u32(static_cast<uint32_t>(x >> 32));
#else
  x -= ((x >> 1) & 0x5555555555555555ULL);
  x = (((x >> 2) & 0x3333333333333333ULL) + (x & 0x3333333333333333ULL));
  x = (((x >> 4) + x) & 0x0F0F0F0F0F0F0F0FULL);
  x += (x >> 8);
  x += (x >> 16);
  x += (x >> 32);
  return static_cast<size_t>(x & 0x7Fu);
#endif
}

// Skip HWY_API due to GCC "function not considered for inlining". Previously
// such errors were caused by underlying type mismatches, but it's not clear
// what is still mismatched despite all the casts.
template <typename TI>
/*HWY_API*/ constexpr size_t FloorLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x >> 1)) + 1);
}

template <typename TI>
/*HWY_API*/ constexpr size_t CeilLog2(TI x) {
  return x == TI{1}
             ? 0
             : static_cast<size_t>(FloorLog2(static_cast<TI>(x - 1)) + 1);
}

template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::FloatTag /*tag*/, T t, size_t n) {
  return t + static_cast<T>(n);
}

template <typename T>
HWY_INLINE constexpr T AddWithWraparound(hwy::NonFloatTag /*tag*/, T t,
                                         size_t n) {
  using TU = MakeUnsigned<T>;
  return static_cast<T>(
      static_cast<TU>(static_cast<TU>(t) + static_cast<TU>(n)) &
      hwy::LimitsMax<TU>());
}

#if HWY_COMPILER_MSVC && HWY_ARCH_X86_64
#pragma intrinsic(_umul128)
#endif

// 64 x 64 = 128 bit multiplication
HWY_API uint64_t Mul128(uint64_t a, uint64_t b, uint64_t* HWY_RESTRICT upper) {
#if defined(__SIZEOF_INT128__)
  __uint128_t product = (__uint128_t)a * (__uint128_t)b;
  *upper = (uint64_t)(product >> 64);
  return (uint64_t)(product & 0xFFFFFFFFFFFFFFFFULL);
#elif HWY_COMPILER_MSVC && HWY_ARCH_X86_64
  return _umul128(a, b, upper);
#else
  constexpr uint64_t kLo32 = 0xFFFFFFFFU;
  const uint64_t lo_lo = (a & kLo32) * (b & kLo32);
  const uint64_t hi_lo = (a >> 32) * (b & kLo32);
  const uint64_t lo_hi = (a & kLo32) * (b >> 32);
  const uint64_t hi_hi = (a >> 32) * (b >> 32);
  const uint64_t t = (lo_lo >> 32) + (hi_lo & kLo32) + lo_hi;
  *upper = (hi_lo >> 32) + (t >> 32) + hi_hi;
  return (t << 32) | (lo_lo & kLo32);
#endif
}
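
// Example usage (sketch; `key` is a placeholder input): obtain the full
// 128-bit product of two 64-bit values, e.g. for multiplicative hashing:
//   uint64_t hi;
//   const uint64_t lo = Mul128(key, 0x9E3779B97F4A7C15ull, &hi);
//   const uint64_t hash = hi ^ lo;  // combine halves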
// Prevents the compiler from eliding the computations that led to "output".
template <class T>
HWY_API void PreventElision(T&& output) {
#if HWY_COMPILER_MSVC
  // MSVC does not support inline assembly anymore (and never supported GCC's
  // RTL constraints). Self-assignment with #pragma optimize("off") might be
  // expected to prevent elision, but it does not with MSVC 2015. Type-punning
  // with volatile pointers generates inefficient code on MSVC 2017.
  static std::atomic<RemoveRef<T>> dummy;
  dummy.store(output, std::memory_order_relaxed);
#else
  // Works by indicating to the compiler that "output" is being read and
  // modified. The +r constraint avoids unnecessary writes to memory, but only
  // works for built-in types (typically FuncOutput).
  asm volatile("" : "+r"(output) : : "memory");
#endif
}

}  // namespace hwy

#endif  // HIGHWAY_HWY_BASE_H_